[llvm] 189900e - X86: Stop assigning register costs for longer encodings.
Matthias Braun via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 30 16:03:56 PDT 2022
Author: Matthias Braun
Date: 2022-09-30T16:01:33-07:00
New Revision: 189900eb149bb55ae3787346f57c1ccbdc50fb3c
URL: https://github.com/llvm/llvm-project/commit/189900eb149bb55ae3787346f57c1ccbdc50fb3c
DIFF: https://github.com/llvm/llvm-project/commit/189900eb149bb55ae3787346f57c1ccbdc50fb3c.diff
LOG: X86: Stop assigning register costs for longer encodings.
This stops reporting CostPerUse 1 for `R8`-`R15` and `XMM8`-`XMM31`.
This was previously done because instructions using these registers
require a REX prefix, resulting in longer instruction encodings. I found
that this regresses the quality of the register allocation, as the costs
impose an ordering on eviction candidates. I also feel there is a bit of
an impedance mismatch: the actual cost occurs when encoding instructions
that use those registers, but the order of VReg assignments is not
primarily determined by the number of Defs+Uses.
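For illustration (this example is not part of the original patch), this is
the encoding-size difference the old CostPerUse value modeled; the byte
counts follow the standard x86-64 encoding rules:

    # Same operation, legacy vs. REX-extended registers:
    addl  %esi, %edi        # 01 f7         (2 bytes, no prefix needed)
    addl  %r8d, %r9d        # 45 01 c1      (3 bytes, REX.RB prefix)
    # Likewise for XMM8-XMM15 with legacy SSE encodings:
    addps %xmm1, %xmm0      # 0f 58 c1      (3 bytes)
    addps %xmm9, %xmm8      # 45 0f 58 c1   (4 bytes, REX.RB prefix)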
I did extensive measurements with the llvm-test-suite (with SPEC2006 and
SPEC2017 included); internal services showed similar patterns. Generally
there are a lot of improvements but also a lot of regressions. On
average, however, the allocation quality seems to improve at the cost of
a small code size regression.
Results for measuring static and dynamic instruction counts:
Dynamic Counts (scaled by execution frequency) / Optimization Remarks:
Spills+FoldedSpills -5.6%
Reloads+FoldedReloads -4.2%
Copies -0.1%
Static / LLVM Statistics:
regalloc.NumSpills mean -1.6%, geomean -2.8%
regalloc.NumReloads mean -1.7%, geomean -3.1%
size..text mean +0.4%, geomean +0.4%
Static / LLVM Statistics:
regalloc.NumSpills mean -2.2%, geomean -3.1%
regalloc.NumReloads mean -2.6%, geomean -3.9%
size..text mean +0.6%, geomean +0.6%
Static / LLVM Statistics:
regalloc.NumSpills mean -3.0%
regalloc.NumReloads mean -3.3%
size..text mean +0.3%, geomean +0.3%
Differential Revision: https://reviews.llvm.org/D133902
Added:
Modified:
llvm/lib/Target/X86/X86RegisterInfo.td
llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll
llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
llvm/test/CodeGen/X86/AMX/amx-across-func.ll
llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll
llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
llvm/test/CodeGen/X86/AMX/amx-spill.ll
llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
llvm/test/CodeGen/X86/StackColoring.ll
llvm/test/CodeGen/X86/add-and-not.ll
llvm/test/CodeGen/X86/addcarry.ll
llvm/test/CodeGen/X86/avg.ll
llvm/test/CodeGen/X86/avoid-sfb.ll
llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/avx-load-store.ll
llvm/test/CodeGen/X86/avx512-calling-conv.ll
llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
llvm/test/CodeGen/X86/bfloat.ll
llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
llvm/test/CodeGen/X86/bitreverse.ll
llvm/test/CodeGen/X86/break-false-dep.ll
llvm/test/CodeGen/X86/bswap.ll
llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll
llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll
llvm/test/CodeGen/X86/cgp-usubo.ll
llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
llvm/test/CodeGen/X86/combine-pmuldq.ll
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/commute-fcmp.ll
llvm/test/CodeGen/X86/compact-unwind.ll
llvm/test/CodeGen/X86/conditional-tailcall.ll
llvm/test/CodeGen/X86/copy-eflags.ll
llvm/test/CodeGen/X86/ctpop-combine.ll
llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
llvm/test/CodeGen/X86/dagcombine-cse.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
llvm/test/CodeGen/X86/divmod128.ll
llvm/test/CodeGen/X86/extract-bits.ll
llvm/test/CodeGen/X86/flt-rounds.ll
llvm/test/CodeGen/X86/fma-commute-loop.ll
llvm/test/CodeGen/X86/fmaddsub-combine.ll
llvm/test/CodeGen/X86/fmaxnum.ll
llvm/test/CodeGen/X86/fminnum.ll
llvm/test/CodeGen/X86/fp-stack-2results.ll
llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
llvm/test/CodeGen/X86/fp128-select.ll
llvm/test/CodeGen/X86/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
llvm/test/CodeGen/X86/gather-addresses.ll
llvm/test/CodeGen/X86/h-registers-1.ll
llvm/test/CodeGen/X86/haddsub-2.ll
llvm/test/CodeGen/X86/haddsub-4.ll
llvm/test/CodeGen/X86/hoist-invariant-load.ll
llvm/test/CodeGen/X86/i128-mul.ll
llvm/test/CodeGen/X86/load-local-v3i1.ll
llvm/test/CodeGen/X86/lrshrink.ll
llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll
llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
llvm/test/CodeGen/X86/machine-cp.ll
llvm/test/CodeGen/X86/madd.ll
llvm/test/CodeGen/X86/masked-iv-unsafe.ll
llvm/test/CodeGen/X86/masked_compressstore.ll
llvm/test/CodeGen/X86/masked_expandload.ll
llvm/test/CodeGen/X86/masked_gather.ll
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
llvm/test/CodeGen/X86/misched-matmul.ll
llvm/test/CodeGen/X86/mmx-arith.ll
llvm/test/CodeGen/X86/mul-constant-result.ll
llvm/test/CodeGen/X86/mul-i1024.ll
llvm/test/CodeGen/X86/mul-i256.ll
llvm/test/CodeGen/X86/mul-i512.ll
llvm/test/CodeGen/X86/muloti.ll
llvm/test/CodeGen/X86/musttail-varargs.ll
llvm/test/CodeGen/X86/nontemporal-loads.ll
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/or-address.ll
llvm/test/CodeGen/X86/paddus.ll
llvm/test/CodeGen/X86/pmul.ll
llvm/test/CodeGen/X86/pmulh.ll
llvm/test/CodeGen/X86/popcnt.ll
llvm/test/CodeGen/X86/pr18344.ll
llvm/test/CodeGen/X86/pr21792.ll
llvm/test/CodeGen/X86/pr23603.ll
llvm/test/CodeGen/X86/pr29112.ll
llvm/test/CodeGen/X86/pr32329.ll
llvm/test/CodeGen/X86/pr35316.ll
llvm/test/CodeGen/X86/pr38185.ll
llvm/test/CodeGen/X86/pr38217.ll
llvm/test/CodeGen/X86/pr43820.ll
llvm/test/CodeGen/X86/pr45563-2.ll
llvm/test/CodeGen/X86/pr45563.ll
llvm/test/CodeGen/X86/pr45995.ll
llvm/test/CodeGen/X86/pr46877.ll
llvm/test/CodeGen/X86/pr47299.ll
llvm/test/CodeGen/X86/pr47857.ll
llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll
llvm/test/CodeGen/X86/promote-cmp.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
llvm/test/CodeGen/X86/reverse_branches.ll
llvm/test/CodeGen/X86/sad.ll
llvm/test/CodeGen/X86/sadd_sat_vec.ll
llvm/test/CodeGen/X86/sbb-false-dep.ll
llvm/test/CodeGen/X86/scalar_widen_div.ll
llvm/test/CodeGen/X86/scheduler-backtracking.ll
llvm/test/CodeGen/X86/sdiv_fix.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/setcc-wide-types.ll
llvm/test/CodeGen/X86/shift-i128.ll
llvm/test/CodeGen/X86/shrink_vmul.ll
llvm/test/CodeGen/X86/smul-with-overflow.ll
llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
llvm/test/CodeGen/X86/speculative-load-hardening.ll
llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/sse-intel-ocl.ll
llvm/test/CodeGen/X86/sse-regcall.ll
llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/sshl_sat.ll
llvm/test/CodeGen/X86/ssub_sat_vec.ll
llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir
llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir
llvm/test/CodeGen/X86/statepoint-live-in-remat.ll
llvm/test/CodeGen/X86/statepoint-live-in.ll
llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll
llvm/test/CodeGen/X86/statepoint-regs.ll
llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
llvm/test/CodeGen/X86/statepoint-stack-usage.ll
llvm/test/CodeGen/X86/statepoint-vreg-details.ll
llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll
llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
llvm/test/CodeGen/X86/statepoint-vreg.ll
llvm/test/CodeGen/X86/statepoint-vreg.mir
llvm/test/CodeGen/X86/subcarry.ll
llvm/test/CodeGen/X86/swifterror.ll
llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
llvm/test/CodeGen/X86/tail-opts.ll
llvm/test/CodeGen/X86/tailcallstack64.ll
llvm/test/CodeGen/X86/tailccstack64.ll
llvm/test/CodeGen/X86/twoaddr-lea.ll
llvm/test/CodeGen/X86/uadd_sat_vec.ll
llvm/test/CodeGen/X86/udiv_fix_sat.ll
llvm/test/CodeGen/X86/umul-with-overflow.ll
llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
llvm/test/CodeGen/X86/usub_sat_vec.ll
llvm/test/CodeGen/X86/var-permute-128.ll
llvm/test/CodeGen/X86/var-permute-512.ll
llvm/test/CodeGen/X86/vec_int_to_fp.ll
llvm/test/CodeGen/X86/vec_saddo.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/vec_ssubo.ll
llvm/test/CodeGen/X86/vec_uaddo.ll
llvm/test/CodeGen/X86/vec_umulo.ll
llvm/test/CodeGen/X86/vec_usubo.ll
llvm/test/CodeGen/X86/vector-bitreverse.ll
llvm/test/CodeGen/X86/vector-compare-results.ll
llvm/test/CodeGen/X86/vector-fshl-128.ll
llvm/test/CodeGen/X86/vector-fshl-256.ll
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
llvm/test/CodeGen/X86/vector-interleave.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
llvm/test/CodeGen/X86/vector-reduce-add-sext.ll
llvm/test/CodeGen/X86/vector-reduce-fmax.ll
llvm/test/CodeGen/X86/vector-reduce-fmin.ll
llvm/test/CodeGen/X86/vector-reduce-mul.ll
llvm/test/CodeGen/X86/vector-reduce-umax.ll
llvm/test/CodeGen/X86/vector-reduce-umin.ll
llvm/test/CodeGen/X86/vector-rotate-256.ll
llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
llvm/test/CodeGen/X86/vector-shuffle-v192.ll
llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
llvm/test/CodeGen/X86/vector-trunc-math.ll
llvm/test/CodeGen/X86/vector-trunc-packus.ll
llvm/test/CodeGen/X86/vector-trunc-ssat.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
llvm/test/CodeGen/X86/vector-zext.ll
llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
llvm/test/CodeGen/X86/vselect-minmax.ll
llvm/test/CodeGen/X86/vselect-packss.ll
llvm/test/CodeGen/X86/x86-cmov-converter.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
llvm/test/CodeGen/X86/zext-sext.ll
llvm/test/CodeGen/X86/znver3-gather.ll
llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding.mir
llvm/test/DebugInfo/MIR/InstrRef/phi-coalescing.mir
llvm/test/DebugInfo/X86/live-debug-variables.ll
llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
llvm/test/tools/llvm-locstats/locstats.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 6dc51e37d3c2f..b5b151d3090e1 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -61,7 +61,6 @@ def CH : X86Reg<"ch", 5>;
def BH : X86Reg<"bh", 7>;
// X86-64 only, requires REX.
-let CostPerUse = [1] in {
def SIL : X86Reg<"sil", 6>;
def DIL : X86Reg<"dil", 7>;
def BPL : X86Reg<"bpl", 5>;
@@ -74,7 +73,6 @@ def R12B : X86Reg<"r12b", 12>;
def R13B : X86Reg<"r13b", 13>;
def R14B : X86Reg<"r14b", 14>;
def R15B : X86Reg<"r15b", 15>;
-}
let isArtificial = 1 in {
// High byte of the low 16 bits of the super-register:
@@ -126,8 +124,7 @@ def SP : X86Reg<"sp", 4, [SPL,SPH]>;
def IP : X86Reg<"ip", 0>;
// X86-64 only, requires REX.
-let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = [1],
- CoveredBySubRegs = 1 in {
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CoveredBySubRegs = 1 in {
def R8W : X86Reg<"r8w", 8, [R8B,R8BH]>;
def R9W : X86Reg<"r9w", 9, [R9B,R9BH]>;
def R10W : X86Reg<"r10w", 10, [R10B,R10BH]>;
@@ -152,8 +149,7 @@ def EIP : X86Reg<"eip", 0, [IP, HIP]>, DwarfRegNum<[-2, 8, 8]>;
}
// X86-64 only, requires REX
-let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = [1],
- CoveredBySubRegs = 1 in {
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CoveredBySubRegs = 1 in {
def R8D : X86Reg<"r8d", 8, [R8W,R8WH]>;
def R9D : X86Reg<"r9d", 9, [R9W,R9WH]>;
def R10D : X86Reg<"r10d", 10, [R10W,R10WH]>;
@@ -176,7 +172,6 @@ def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
// These also require REX.
-let CostPerUse = [1] in {
def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>;
def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>;
def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
@@ -186,7 +181,7 @@ def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>;
-}}
+}
// MMX Registers. These are actually aliased to ST0 .. ST7
def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
@@ -219,7 +214,6 @@ def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
// X86-64 only
-let CostPerUse = [1] in {
def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>;
def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>;
def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
@@ -246,8 +240,6 @@ def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[80, -2, -2]>;
def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>;
def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>;
-} // CostPerUse
-
// YMM0-15 registers, used by AVX instructions and
// YMM16-31 registers, used by AVX-512 instructions.
let SubRegIndices = [sub_xmm] in {
diff --git a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
index f187d503dfe71..6f9d13cde6b2e 100644
--- a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
+++ b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
@@ -14,43 +14,43 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-NEXT: movslq (%rdi), %rdi
-; CHECK-NEXT: movslq (%rsi), %r8
-; CHECK-NEXT: movslq (%rdx), %r10
-; CHECK-NEXT: movl (%rcx), %esi
+; CHECK-NEXT: movslq (%rdi), %r8
+; CHECK-NEXT: movslq (%rsi), %rax
+; CHECK-NEXT: movslq (%rdx), %rsi
+; CHECK-NEXT: movl (%rcx), %edi
; CHECK-NEXT: movq %rsp, %rcx
-; CHECK-NEXT: subl %edi, %r8d
-; CHECK-NEXT: movslq %r8d, %rdx
+; CHECK-NEXT: subl %r8d, %eax
+; CHECK-NEXT: movslq %eax, %rdx
; CHECK-NEXT: js .LBB0_1
; CHECK-NEXT: # %bb.11: # %b63
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_14
; CHECK-NEXT: # %bb.12:
-; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: xorl %r8d, %r8d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_13: # %a25b
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: testb %r8b, %r8b
; CHECK-NEXT: je .LBB0_13
; CHECK-NEXT: .LBB0_14: # %b85
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: movb $1, %r8b
+; CHECK-NEXT: testb %r8b, %r8b
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.15:
-; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: xorl %r8d, %r8d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_16: # %a25b140
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: testb %r8b, %r8b
; CHECK-NEXT: je .LBB0_16
; CHECK-NEXT: .LBB0_1: # %a29b
-; CHECK-NEXT: cmpl %r10d, %esi
+; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: js .LBB0_10
; CHECK-NEXT: # %bb.2: # %b158
; CHECK-NEXT: movslq (%r9), %rsi
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: movb $1, %r9b
+; CHECK-NEXT: movb $1, %r8b
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_9: # %b1606
@@ -73,7 +73,7 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
; CHECK-NEXT: # Child Loop BB0_33 Depth 3
; CHECK-NEXT: # Child Loop BB0_34 Depth 2
; CHECK-NEXT: # Child Loop BB0_36 Depth 2
-; CHECK-NEXT: testl %r8d, %r8d
+; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: js .LBB0_4
; CHECK-NEXT: # %bb.17: # %b179
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
@@ -87,7 +87,7 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
; CHECK-NEXT: je .LBB0_37
; CHECK-NEXT: .LBB0_18: # %b188
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: testb %r9b, %r9b
+; CHECK-NEXT: testb %r8b, %r8b
; CHECK-NEXT: jne .LBB0_4
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_19: # %a30b294
@@ -97,23 +97,23 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
; CHECK-NEXT: je .LBB0_19
; CHECK-NEXT: .LBB0_4: # %a33b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: movl %esi, %r10d
-; CHECK-NEXT: orl %r8d, %r10d
+; CHECK-NEXT: movl %esi, %r9d
+; CHECK-NEXT: orl %eax, %r9d
; CHECK-NEXT: jns .LBB0_20
; CHECK-NEXT: .LBB0_5: # %a50b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: shrl $31, %r10d
-; CHECK-NEXT: movl %r8d, %eax
-; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: shrl $31, %r9d
+; CHECK-NEXT: movl %eax, %r10d
+; CHECK-NEXT: orl %esi, %r10d
; CHECK-NEXT: jns .LBB0_26
; CHECK-NEXT: .LBB0_6: # %a57b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: shrl $31, %eax
-; CHECK-NEXT: testb %r10b, %r10b
+; CHECK-NEXT: shrl $31, %r10d
+; CHECK-NEXT: testb %r9b, %r9b
; CHECK-NEXT: je .LBB0_30
; CHECK-NEXT: .LBB0_7: # %a66b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: testb %r10b, %r10b
; CHECK-NEXT: jne .LBB0_8
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_34: # %a74b
@@ -127,7 +127,7 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
; CHECK-NEXT: jne .LBB0_34
; CHECK-NEXT: .LBB0_8: # %a93b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: testl %r8d, %r8d
+; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: js .LBB0_9
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_36: # %a97b
@@ -183,7 +183,7 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
; CHECK-NEXT: je .LBB0_38
; CHECK-NEXT: .LBB0_27: # %b879
; CHECK-NEXT: # in Loop: Header=BB0_26 Depth=2
-; CHECK-NEXT: testb %r9b, %r9b
+; CHECK-NEXT: testb %r8b, %r8b
; CHECK-NEXT: jne .LBB0_28
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_29: # %a53b1019
diff --git a/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll b/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll
index b0b8771a7d512..7bdc4e19a1cf6 100644
--- a/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll
+++ b/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll
@@ -28,23 +28,26 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
-; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: subq $32, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: .cfi_offset %rbx, -40
+; CHECK-NEXT: .cfi_offset %rbx, -48
+; CHECK-NEXT: .cfi_offset %r12, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: callq __ubyte_convert_to_ctype
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: js LBB0_4
; CHECK-NEXT: ## %bb.1: ## %cond_next.i
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: movq %rbx, %rdi
; CHECK-NEXT: callq __ubyte_convert_to_ctype
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: sarl $31, %ecx
@@ -66,18 +69,18 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: cmpl $-1, %eax
; CHECK-NEXT: je LBB0_3
; CHECK-NEXT: LBB0_6: ## %bb35
-; CHECK-NEXT: movq _PyUFunc_API@GOTPCREL(%rip), %rbp
-; CHECK-NEXT: movq (%rbp), %rax
+; CHECK-NEXT: movq _PyUFunc_API@GOTPCREL(%rip), %r14
+; CHECK-NEXT: movq (%r14), %rax
; CHECK-NEXT: callq *216(%rax)
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-NEXT: testb %dl, %dl
; CHECK-NEXT: je LBB0_11
; CHECK-NEXT: ## %bb.7: ## %cond_false.i
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-NEXT: movzbl %bl, %ecx
+; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: movzbl %sil, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: divb %dl
-; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: movl %eax, %r15d
; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne LBB0_12
; CHECK-NEXT: jmp LBB0_14
@@ -91,26 +94,25 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: movq 80(%rax), %rax
; CHECK-NEXT: LBB0_10: ## %bb4
; CHECK-NEXT: movq 96(%rax), %rax
-; CHECK-NEXT: movq %rbx, %rdi
-; CHECK-NEXT: movq %r14, %rsi
+; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: movq %rbx, %rsi
; CHECK-NEXT: callq *40(%rax)
; CHECK-NEXT: jmp LBB0_28
; CHECK-NEXT: LBB0_11: ## %cond_true.i
; CHECK-NEXT: movl $4, %edi
; CHECK-NEXT: callq _feraiseexcept
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-NEXT: xorl %r14d, %r14d
-; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: testb %sil, %sil
; CHECK-NEXT: je LBB0_14
; CHECK-NEXT: LBB0_12: ## %cond_false.i
; CHECK-NEXT: testb %dl, %dl
; CHECK-NEXT: je LBB0_14
; CHECK-NEXT: ## %bb.13: ## %cond_next17.i
-; CHECK-NEXT: movzbl %bl, %eax
+; CHECK-NEXT: movzbl %sil, %eax
; CHECK-NEXT: divb %dl
-; CHECK-NEXT: movzbl %ah, %eax
-; CHECK-NEXT: movl %eax, %r15d
+; CHECK-NEXT: movzbl %ah, %ebx
; CHECK-NEXT: jmp LBB0_18
; CHECK-NEXT: LBB0_14: ## %cond_true.i200
; CHECK-NEXT: testb %dl, %dl
@@ -119,15 +121,15 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: movl $4, %edi
; CHECK-NEXT: callq _feraiseexcept
; CHECK-NEXT: LBB0_17: ## %ubyte_ctype_remainder.exit
-; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: LBB0_18: ## %ubyte_ctype_remainder.exit
-; CHECK-NEXT: movq (%rbp), %rax
+; CHECK-NEXT: movq (%r14), %rax
; CHECK-NEXT: callq *224(%rax)
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: je LBB0_21
; CHECK-NEXT: ## %bb.19: ## %cond_true61
-; CHECK-NEXT: movl %eax, %ebx
-; CHECK-NEXT: movq (%rbp), %rax
+; CHECK-NEXT: movl %eax, %ebp
+; CHECK-NEXT: movq (%r14), %rax
; CHECK-NEXT: movq _.str5@GOTPCREL(%rip), %rdi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
@@ -137,11 +139,11 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: js LBB0_27
; CHECK-NEXT: ## %bb.20: ## %cond_next73
; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq (%rbp), %rax
+; CHECK-NEXT: movq (%r14), %rax
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT: movl %ebx, %edx
+; CHECK-NEXT: movl %ebp, %edx
; CHECK-NEXT: callq *232(%rax)
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: jne LBB0_27
@@ -151,40 +153,41 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: je LBB0_27
; CHECK-NEXT: ## %bb.22: ## %cond_next97
-; CHECK-NEXT: movq %rax, %rbx
-; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %rbp
-; CHECK-NEXT: movq (%rbp), %rax
+; CHECK-NEXT: movq %rax, %r14
+; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %r12
+; CHECK-NEXT: movq (%r12), %rax
; CHECK-NEXT: movq 200(%rax), %rdi
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: callq *304(%rdi)
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.23: ## %cond_next135
-; CHECK-NEXT: movb %r14b, 16(%rax)
-; CHECK-NEXT: movq %rax, 24(%rbx)
-; CHECK-NEXT: movq (%rbp), %rax
+; CHECK-NEXT: movb %r15b, 16(%rax)
+; CHECK-NEXT: movq %rax, 24(%r14)
+; CHECK-NEXT: movq (%r12), %rax
; CHECK-NEXT: movq 200(%rax), %rdi
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: callq *304(%rdi)
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.24: ## %cond_next182
-; CHECK-NEXT: movb %r15b, 16(%rax)
-; CHECK-NEXT: movq %rax, 32(%rbx)
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: movb %bl, 16(%rax)
+; CHECK-NEXT: movq %rax, 32(%r14)
+; CHECK-NEXT: movq %r14, %rax
; CHECK-NEXT: jmp LBB0_28
; CHECK-NEXT: LBB0_25: ## %cond_true113
-; CHECK-NEXT: decq (%rbx)
+; CHECK-NEXT: decq (%r14)
; CHECK-NEXT: jne LBB0_27
; CHECK-NEXT: ## %bb.26: ## %cond_true126
-; CHECK-NEXT: movq 8(%rbx), %rax
-; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: movq 8(%r14), %rax
+; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: callq *48(%rax)
; CHECK-NEXT: LBB0_27: ## %UnifiedReturnBlock
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: LBB0_28: ## %UnifiedReturnBlock
-; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: addq $32, %rsp
; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
diff --git a/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 7c782da4f47e4..f811b2f09ba85 100644
--- a/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -16,16 +16,16 @@ define ptr @t(ptr %desc, i64 %p) nounwind ssp {
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: orq $2097152, %r14 ## imm = 0x200000
-; CHECK-NEXT: andl $15728640, %r14d ## imm = 0xF00000
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: orq $2097152, %rbx ## imm = 0x200000
+; CHECK-NEXT: andl $15728640, %ebx ## imm = 0xF00000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_1: ## %bb4
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: callq _xxGetOffsetForCode
-; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: callq _xxCalculateMidType
@@ -33,7 +33,7 @@ define ptr @t(ptr %desc, i64 %p) nounwind ssp {
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %bb26
; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: cmpl $1048576, %r14d ## imm = 0x100000
+; CHECK-NEXT: cmpl $1048576, %ebx ## imm = 0x100000
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.3: ## %bb.i
; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1
diff --git a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
index 5c3712e88cb49..d2cc1b3599cbc 100644
--- a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
+++ b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
@@ -10,226 +10,220 @@
define fastcc i64 @foo() nounwind {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: pushq %r13
-; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq X(%rip), %rcx
; CHECK-NEXT: movq X(%rip), %r9
-; CHECK-NEXT: movq X(%rip), %r15
-; CHECK-NEXT: movq X(%rip), %rax
-; CHECK-NEXT: movq X(%rip), %rdx
-; CHECK-NEXT: movq X(%rip), %r12
-; CHECK-NEXT: movq X(%rip), %r14
-; CHECK-NEXT: movq X(%rip), %r11
+; CHECK-NEXT: movq X(%rip), %r8
; CHECK-NEXT: movq X(%rip), %rdi
-; CHECK-NEXT: addq %r12, %rdi
-; CHECK-NEXT: movq X(%rip), %rcx
+; CHECK-NEXT: movq X(%rip), %rsi
+; CHECK-NEXT: movq X(%rip), %rdx
; CHECK-NEXT: movq X(%rip), %rbx
-; CHECK-NEXT: bswapq %rcx
-; CHECK-NEXT: leaq (%r11,%r14), %rsi
-; CHECK-NEXT: addq %r12, %rsi
-; CHECK-NEXT: addq %rdi, %rsi
-; CHECK-NEXT: addq %rcx, %rsi
-; CHECK-NEXT: leaq (%r15,%r9), %r8
-; CHECK-NEXT: leaq (%r8,%rax), %r10
-; CHECK-NEXT: addq %rsi, %rdx
+; CHECK-NEXT: movq X(%rip), %rax
+; CHECK-NEXT: addq %rsi, %rax
+; CHECK-NEXT: movq X(%rip), %r10
+; CHECK-NEXT: movq X(%rip), %r11
+; CHECK-NEXT: bswapq %r10
+; CHECK-NEXT: leaq (%rbx,%rdx), %r14
+; CHECK-NEXT: addq %rsi, %r14
+; CHECK-NEXT: addq %rax, %r14
+; CHECK-NEXT: addq %r10, %r14
+; CHECK-NEXT: leaq (%r9,%rcx), %rax
+; CHECK-NEXT: leaq (%rax,%r8), %r10
+; CHECK-NEXT: addq %r14, %rdi
; CHECK-NEXT: addq %r10, %r10
+; CHECK-NEXT: bswapq %r11
+; CHECK-NEXT: addq %r14, %r10
+; CHECK-NEXT: addq %rbx, %r11
+; CHECK-NEXT: leaq (%rsi,%rdx), %rbx
+; CHECK-NEXT: addq %rdi, %rbx
+; CHECK-NEXT: addq %rbx, %r11
+; CHECK-NEXT: addq %rax, %rax
+; CHECK-NEXT: addq %r10, %rax
+; CHECK-NEXT: movq X(%rip), %rbx
+; CHECK-NEXT: addq %rdi, %r11
+; CHECK-NEXT: addq %r11, %r8
+; CHECK-NEXT: addq %r10, %rax
+; CHECK-NEXT: addq %r11, %rax
+; CHECK-NEXT: bswapq %rbx
+; CHECK-NEXT: leaq (%rdi,%rsi), %r11
+; CHECK-NEXT: addq %r8, %r11
+; CHECK-NEXT: addq %rdx, %rbx
+; CHECK-NEXT: addq %r11, %rbx
+; CHECK-NEXT: leaq (%r10,%rcx), %rdx
+; CHECK-NEXT: addq %rdx, %rdx
+; CHECK-NEXT: addq %rax, %rdx
+; CHECK-NEXT: movq X(%rip), %r11
+; CHECK-NEXT: addq %r8, %rbx
+; CHECK-NEXT: addq %rbx, %r9
+; CHECK-NEXT: addq %rax, %rdx
+; CHECK-NEXT: addq %rbx, %rdx
+; CHECK-NEXT: bswapq %r11
+; CHECK-NEXT: leaq (%r8,%rdi), %rbx
+; CHECK-NEXT: addq %r9, %rbx
+; CHECK-NEXT: addq %rsi, %r11
+; CHECK-NEXT: addq %rbx, %r11
+; CHECK-NEXT: leaq (%rax,%r10), %rsi
+; CHECK-NEXT: addq %rsi, %rsi
+; CHECK-NEXT: addq %rdx, %rsi
+; CHECK-NEXT: movq X(%rip), %rbx
+; CHECK-NEXT: addq %r9, %r11
+; CHECK-NEXT: addq %r11, %rcx
+; CHECK-NEXT: addq %rdx, %rsi
+; CHECK-NEXT: addq %r11, %rsi
; CHECK-NEXT: bswapq %rbx
-; CHECK-NEXT: addq %rsi, %r10
+; CHECK-NEXT: leaq (%r9,%r8), %r11
+; CHECK-NEXT: addq %rcx, %r11
+; CHECK-NEXT: addq %rdi, %rbx
; CHECK-NEXT: addq %r11, %rbx
-; CHECK-NEXT: leaq (%r12,%r14), %rcx
-; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: leaq (%rdx,%rax), %rdi
+; CHECK-NEXT: addq %rdi, %rdi
+; CHECK-NEXT: addq %rsi, %rdi
+; CHECK-NEXT: movq X(%rip), %r11
; CHECK-NEXT: addq %rcx, %rbx
+; CHECK-NEXT: addq %rbx, %r10
+; CHECK-NEXT: addq %rsi, %rdi
+; CHECK-NEXT: addq %rbx, %rdi
+; CHECK-NEXT: bswapq %r11
+; CHECK-NEXT: leaq (%rcx,%r9), %rbx
+; CHECK-NEXT: addq %r10, %rbx
+; CHECK-NEXT: addq %r8, %r11
+; CHECK-NEXT: addq %rbx, %r11
+; CHECK-NEXT: leaq (%rsi,%rdx), %r8
; CHECK-NEXT: addq %r8, %r8
-; CHECK-NEXT: addq %r10, %r8
-; CHECK-NEXT: movq X(%rip), %rcx
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: movq X(%rip), %rbx
+; CHECK-NEXT: addq %r10, %r11
+; CHECK-NEXT: addq %r11, %rax
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: addq %r11, %r8
+; CHECK-NEXT: bswapq %rbx
+; CHECK-NEXT: leaq (%r10,%rcx), %r11
+; CHECK-NEXT: addq %rax, %r11
+; CHECK-NEXT: addq %r9, %rbx
+; CHECK-NEXT: addq %r11, %rbx
+; CHECK-NEXT: leaq (%rdi,%rsi), %r9
+; CHECK-NEXT: addq %r9, %r9
+; CHECK-NEXT: addq %r8, %r9
+; CHECK-NEXT: movq X(%rip), %r11
+; CHECK-NEXT: addq %rax, %rbx
+; CHECK-NEXT: addq %rbx, %rdx
+; CHECK-NEXT: addq %r8, %r9
+; CHECK-NEXT: addq %rbx, %r9
+; CHECK-NEXT: bswapq %r11
+; CHECK-NEXT: leaq (%rax,%r10), %rbx
; CHECK-NEXT: addq %rdx, %rbx
-; CHECK-NEXT: addq %rbx, %rax
-; CHECK-NEXT: addq %r10, %r8
-; CHECK-NEXT: addq %rbx, %r8
-; CHECK-NEXT: bswapq %rcx
-; CHECK-NEXT: leaq (%rdx,%r12), %rsi
-; CHECK-NEXT: addq %rax, %rsi
-; CHECK-NEXT: addq %r14, %rcx
-; CHECK-NEXT: addq %rsi, %rcx
-; CHECK-NEXT: leaq (%r10,%r9), %rbx
-; CHECK-NEXT: addq %rbx, %rbx
-; CHECK-NEXT: addq %r8, %rbx
-; CHECK-NEXT: movq X(%rip), %rdi
-; CHECK-NEXT: addq %rax, %rcx
-; CHECK-NEXT: addq %rcx, %r15
-; CHECK-NEXT: addq %r8, %rbx
-; CHECK-NEXT: addq %rcx, %rbx
-; CHECK-NEXT: bswapq %rdi
-; CHECK-NEXT: leaq (%rax,%rdx), %rcx
-; CHECK-NEXT: addq %r15, %rcx
-; CHECK-NEXT: addq %r12, %rdi
-; CHECK-NEXT: addq %rcx, %rdi
-; CHECK-NEXT: leaq (%r8,%r10), %r12
-; CHECK-NEXT: addq %r12, %r12
-; CHECK-NEXT: addq %rbx, %r12
-; CHECK-NEXT: movq X(%rip), %rcx
-; CHECK-NEXT: addq %r15, %rdi
-; CHECK-NEXT: addq %rdi, %r9
-; CHECK-NEXT: addq %rbx, %r12
-; CHECK-NEXT: addq %rdi, %r12
-; CHECK-NEXT: bswapq %rcx
-; CHECK-NEXT: leaq (%r15,%rax), %rdi
-; CHECK-NEXT: addq %r9, %rdi
-; CHECK-NEXT: addq %rdx, %rcx
-; CHECK-NEXT: addq %rdi, %rcx
-; CHECK-NEXT: leaq (%rbx,%r8), %r13
-; CHECK-NEXT: addq %r13, %r13
-; CHECK-NEXT: addq %r12, %r13
-; CHECK-NEXT: movq X(%rip), %rdx
+; CHECK-NEXT: addq %rcx, %r11
+; CHECK-NEXT: addq %rbx, %r11
+; CHECK-NEXT: leaq (%r8,%rdi), %rcx
+; CHECK-NEXT: addq %rcx, %rcx
; CHECK-NEXT: addq %r9, %rcx
+; CHECK-NEXT: movq X(%rip), %rbx
+; CHECK-NEXT: addq %rdx, %r11
+; CHECK-NEXT: addq %r11, %rsi
+; CHECK-NEXT: addq %r9, %rcx
+; CHECK-NEXT: addq %r11, %rcx
+; CHECK-NEXT: bswapq %rbx
+; CHECK-NEXT: leaq (%rdx,%rax), %r11
+; CHECK-NEXT: addq %rsi, %r11
+; CHECK-NEXT: addq %r10, %rbx
+; CHECK-NEXT: addq %r11, %rbx
+; CHECK-NEXT: leaq (%r9,%r8), %r10
+; CHECK-NEXT: addq %r10, %r10
; CHECK-NEXT: addq %rcx, %r10
-; CHECK-NEXT: addq %r12, %r13
-; CHECK-NEXT: addq %rcx, %r13
-; CHECK-NEXT: bswapq %rdx
-; CHECK-NEXT: leaq (%r9,%r15), %rcx
-; CHECK-NEXT: addq %r10, %rcx
-; CHECK-NEXT: addq %rax, %rdx
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: leaq (%r12,%rbx), %r14
-; CHECK-NEXT: addq %r14, %r14
-; CHECK-NEXT: addq %r13, %r14
-; CHECK-NEXT: movq X(%rip), %rax
-; CHECK-NEXT: addq %r10, %rdx
-; CHECK-NEXT: addq %rdx, %r8
-; CHECK-NEXT: addq %r13, %r14
-; CHECK-NEXT: addq %rdx, %r14
-; CHECK-NEXT: bswapq %rax
-; CHECK-NEXT: leaq (%r10,%r9), %rcx
-; CHECK-NEXT: addq %r8, %rcx
-; CHECK-NEXT: addq %r15, %rax
-; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: leaq (%r13,%r12), %r11
+; CHECK-NEXT: movq X(%rip), %r14
+; CHECK-NEXT: addq %rsi, %rbx
+; CHECK-NEXT: addq %rbx, %rdi
+; CHECK-NEXT: addq %rcx, %r10
+; CHECK-NEXT: addq %rbx, %r10
+; CHECK-NEXT: bswapq %r14
+; CHECK-NEXT: leaq (%rsi,%rdx), %r11
+; CHECK-NEXT: addq %rdi, %r11
+; CHECK-NEXT: addq %rax, %r14
+; CHECK-NEXT: addq %r11, %r14
+; CHECK-NEXT: leaq (%rcx,%r9), %r11
; CHECK-NEXT: addq %r11, %r11
+; CHECK-NEXT: addq %r10, %r11
+; CHECK-NEXT: movq X(%rip), %rax
+; CHECK-NEXT: addq %rdi, %r14
+; CHECK-NEXT: addq %r14, %r8
+; CHECK-NEXT: addq %r10, %r11
; CHECK-NEXT: addq %r14, %r11
-; CHECK-NEXT: movq X(%rip), %rcx
+; CHECK-NEXT: bswapq %rax
+; CHECK-NEXT: leaq (%rdi,%rsi), %rbx
+; CHECK-NEXT: addq %r8, %rbx
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: addq %rbx, %rax
+; CHECK-NEXT: leaq (%r10,%rcx), %rdx
+; CHECK-NEXT: addq %rdx, %rdx
+; CHECK-NEXT: addq %r11, %rdx
+; CHECK-NEXT: movq X(%rip), %rbx
; CHECK-NEXT: addq %r8, %rax
+; CHECK-NEXT: addq %rax, %r9
+; CHECK-NEXT: addq %r11, %rdx
+; CHECK-NEXT: addq %rax, %rdx
+; CHECK-NEXT: bswapq %rbx
+; CHECK-NEXT: leaq (%r8,%rdi), %rax
+; CHECK-NEXT: addq %r9, %rax
+; CHECK-NEXT: addq %rsi, %rbx
; CHECK-NEXT: addq %rax, %rbx
-; CHECK-NEXT: addq %r14, %r11
-; CHECK-NEXT: addq %rax, %r11
-; CHECK-NEXT: bswapq %rcx
-; CHECK-NEXT: leaq (%r8,%r10), %rax
-; CHECK-NEXT: addq %rbx, %rax
-; CHECK-NEXT: addq %r9, %rcx
-; CHECK-NEXT: addq %rax, %rcx
-; CHECK-NEXT: leaq (%r14,%r13), %r9
-; CHECK-NEXT: addq %r9, %r9
-; CHECK-NEXT: addq %r11, %r9
-; CHECK-NEXT: movq X(%rip), %rax
+; CHECK-NEXT: leaq (%r11,%r10), %rax
+; CHECK-NEXT: addq %rax, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: movq X(%rip), %r14
+; CHECK-NEXT: addq %r9, %rbx
; CHECK-NEXT: addq %rbx, %rcx
-; CHECK-NEXT: addq %rcx, %r12
-; CHECK-NEXT: addq %r11, %r9
-; CHECK-NEXT: addq %rcx, %r9
-; CHECK-NEXT: bswapq %rax
-; CHECK-NEXT: leaq (%rbx,%r8), %rcx
-; CHECK-NEXT: addq %r12, %rcx
-; CHECK-NEXT: addq %r10, %rax
-; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: leaq (%r11,%r14), %r10
-; CHECK-NEXT: addq %r10, %r10
-; CHECK-NEXT: addq %r9, %r10
-; CHECK-NEXT: movq X(%rip), %rsi
-; CHECK-NEXT: addq %r12, %rax
-; CHECK-NEXT: addq %rax, %r13
-; CHECK-NEXT: addq %r9, %r10
-; CHECK-NEXT: addq %rax, %r10
-; CHECK-NEXT: bswapq %rsi
-; CHECK-NEXT: leaq (%r12,%rbx), %rax
-; CHECK-NEXT: addq %r13, %rax
-; CHECK-NEXT: addq %r8, %rsi
-; CHECK-NEXT: addq %rax, %rsi
-; CHECK-NEXT: leaq (%r9,%r11), %rdx
-; CHECK-NEXT: addq %rdx, %rdx
-; CHECK-NEXT: addq %r10, %rdx
-; CHECK-NEXT: movq X(%rip), %rax
-; CHECK-NEXT: addq %r13, %rsi
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: addq %rbx, %rax
+; CHECK-NEXT: bswapq %r14
+; CHECK-NEXT: leaq (%r9,%r8), %rsi
+; CHECK-NEXT: addq %rcx, %rsi
+; CHECK-NEXT: addq %rdi, %r14
; CHECK-NEXT: addq %rsi, %r14
-; CHECK-NEXT: addq %r10, %rdx
-; CHECK-NEXT: addq %rsi, %rdx
-; CHECK-NEXT: bswapq %rax
-; CHECK-NEXT: leaq (%r13,%r12), %rsi
+; CHECK-NEXT: leaq (%rdx,%r11), %rsi
+; CHECK-NEXT: addq %rsi, %rsi
+; CHECK-NEXT: addq %rax, %rsi
+; CHECK-NEXT: movq X(%rip), %rdi
+; CHECK-NEXT: addq %rcx, %r14
+; CHECK-NEXT: addq %r14, %r10
+; CHECK-NEXT: addq %rax, %rsi
; CHECK-NEXT: addq %r14, %rsi
-; CHECK-NEXT: addq %rbx, %rax
-; CHECK-NEXT: addq %rsi, %rax
-; CHECK-NEXT: leaq (%r10,%r9), %r8
+; CHECK-NEXT: bswapq %rdi
+; CHECK-NEXT: leaq (%rcx,%r9), %rbx
+; CHECK-NEXT: addq %r10, %rbx
+; CHECK-NEXT: addq %r8, %rdi
+; CHECK-NEXT: addq %rbx, %rdi
+; CHECK-NEXT: leaq (%rax,%rdx), %r8
; CHECK-NEXT: addq %r8, %r8
-; CHECK-NEXT: addq %rdx, %r8
-; CHECK-NEXT: movq X(%rip), %rsi
-; CHECK-NEXT: addq %r14, %rax
-; CHECK-NEXT: addq %rax, %r11
-; CHECK-NEXT: addq %rdx, %r8
-; CHECK-NEXT: addq %rax, %r8
-; CHECK-NEXT: bswapq %rsi
-; CHECK-NEXT: leaq (%r14,%r13), %rax
-; CHECK-NEXT: addq %r11, %rax
-; CHECK-NEXT: addq %r12, %rsi
-; CHECK-NEXT: addq %rax, %rsi
-; CHECK-NEXT: leaq (%rdx,%r10), %rax
-; CHECK-NEXT: addq %rax, %rax
-; CHECK-NEXT: addq %r8, %rax
+; CHECK-NEXT: addq %rsi, %r8
+; CHECK-NEXT: addq %r10, %rdi
+; CHECK-NEXT: addq %rdi, %r11
+; CHECK-NEXT: addq %rsi, %r8
+; CHECK-NEXT: addq %rdi, %r8
; CHECK-NEXT: movq X(%rip), %rdi
-; CHECK-NEXT: addq %r11, %rsi
-; CHECK-NEXT: addq %rsi, %r9
-; CHECK-NEXT: addq %r8, %rax
-; CHECK-NEXT: addq %rsi, %rax
; CHECK-NEXT: bswapq %rdi
-; CHECK-NEXT: leaq (%r11,%r14), %rsi
-; CHECK-NEXT: addq %r9, %rsi
-; CHECK-NEXT: addq %r13, %rdi
-; CHECK-NEXT: addq %rsi, %rdi
-; CHECK-NEXT: leaq (%r8,%rdx), %rsi
-; CHECK-NEXT: addq %rsi, %rsi
-; CHECK-NEXT: addq %rax, %rsi
-; CHECK-NEXT: movq X(%rip), %rcx
; CHECK-NEXT: addq %r9, %rdi
-; CHECK-NEXT: addq %rdi, %r10
-; CHECK-NEXT: addq %rax, %rsi
-; CHECK-NEXT: addq %rdi, %rsi
-; CHECK-NEXT: bswapq %rcx
-; CHECK-NEXT: leaq (%r9,%r11), %rdi
-; CHECK-NEXT: addq %r10, %rdi
-; CHECK-NEXT: addq %r14, %rcx
-; CHECK-NEXT: addq %rdi, %rcx
-; CHECK-NEXT: leaq (%rax,%r8), %rdi
-; CHECK-NEXT: addq %rdi, %rdi
-; CHECK-NEXT: addq %rsi, %rdi
-; CHECK-NEXT: addq %r10, %rcx
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: addq %rsi, %rdi
-; CHECK-NEXT: addq %rcx, %rdi
-; CHECK-NEXT: movq X(%rip), %rcx
-; CHECK-NEXT: bswapq %rcx
-; CHECK-NEXT: addq %r11, %rcx
-; CHECK-NEXT: leaq (%r10,%r9), %rbx
-; CHECK-NEXT: addq %rdx, %rbx
-; CHECK-NEXT: addq %rbx, %rcx
+; CHECK-NEXT: leaq (%r10,%rcx), %r9
+; CHECK-NEXT: addq %r11, %r9
+; CHECK-NEXT: addq %r9, %rdi
; CHECK-NEXT: addq %rax, %rsi
; CHECK-NEXT: addq %rsi, %rsi
+; CHECK-NEXT: addq %r8, %rsi
+; CHECK-NEXT: addq %r8, %rsi
+; CHECK-NEXT: addq %r11, %rdi
+; CHECK-NEXT: addq %rdi, %rdx
; CHECK-NEXT: addq %rdi, %rsi
-; CHECK-NEXT: addq %rdi, %rsi
-; CHECK-NEXT: addq %rdx, %rcx
-; CHECK-NEXT: addq %rcx, %r8
-; CHECK-NEXT: addq %rcx, %rsi
; CHECK-NEXT: movq X(%rip), %rax
; CHECK-NEXT: bswapq %rax
-; CHECK-NEXT: addq %r10, %rdx
+; CHECK-NEXT: addq %r10, %r11
; CHECK-NEXT: movq %rax, X(%rip)
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: addq %r8, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %r11
+; CHECK-NEXT: addq %r11, %rax
; CHECK-NEXT: addq %rdx, %rax
-; CHECK-NEXT: addq %r8, %rax
; CHECK-NEXT: addq %rsi, %rax
; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: popq %r12
-; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
; CHECK-NEXT: retq
%tmp = load volatile i64, ptr @X ; <i64> [#uses=7]
%tmp1 = load volatile i64, ptr @X ; <i64> [#uses=5]
diff --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
index e05f60179c8a9..fa743f26ba2d1 100644
--- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
+++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
@@ -4,21 +4,21 @@
define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(ptr, i32 %c_nblock_used.2.i, i32 %.reload51, ptr %.out, ptr %.out1, ptr %.out2, ptr %.out3) nounwind {
; CHECK-LABEL: BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i:
; CHECK: # %bb.0: # %newFuncRoot
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movl %edx, %edx
; CHECK-NEXT: movl (%rdi,%rdx,4), %edx
-; CHECK-NEXT: movzbl %dl, %eax
-; CHECK-NEXT: addl $4, %eax
+; CHECK-NEXT: movzbl %dl, %r10d
+; CHECK-NEXT: addl $4, %r10d
; CHECK-NEXT: shrq $6, %rdx
; CHECK-NEXT: andl $67108860, %edx # imm = 0x3FFFFFC
; CHECK-NEXT: movl (%rdi,%rdx), %edx
; CHECK-NEXT: movzbl %dl, %edi
; CHECK-NEXT: shrl $8, %edx
; CHECK-NEXT: addl $5, %esi
-; CHECK-NEXT: movl %eax, (%rcx)
+; CHECK-NEXT: movl %r10d, (%rcx)
; CHECK-NEXT: movl %edi, (%r8)
; CHECK-NEXT: movl %edx, (%r9)
-; CHECK-NEXT: movl %esi, (%r10)
+; CHECK-NEXT: movl %esi, (%rax)
; CHECK-NEXT: retq
newFuncRoot:
br label %bb54.i
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
index ee3649f2f3482..c67208dcf44fa 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -219,7 +219,7 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $1096, %rsp # imm = 0x448
-; CHECK-NEXT: movl %edi, %r14d
+; CHECK-NEXT: movl %edi, %ebx
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, (%rsp)
; CHECK-NEXT: movb $1, (%rsp)
@@ -228,37 +228,37 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg (%rsp)
-; CHECK-NEXT: testl %r14d, %r14d
+; CHECK-NEXT: testl %ebx, %ebx
; CHECK-NEXT: jg .LBB2_4
; CHECK-NEXT: # %bb.1: # %.preheader
; CHECK-NEXT: movl $7, %ebp
-; CHECK-NEXT: movl $buf, %r15d
-; CHECK-NEXT: movl $32, %r12d
-; CHECK-NEXT: movw $8, %bx
+; CHECK-NEXT: movl $buf, %r14d
+; CHECK-NEXT: movl $32, %r15d
+; CHECK-NEXT: movw $8, %r12w
; CHECK-NEXT: movl $buf+2048, %r13d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0
+; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg (%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
-; CHECK-NEXT: tilestored %tmm0, (%r13,%r12)
+; CHECK-NEXT: tilestored %tmm0, (%r13,%r15)
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg (%rsp)
; CHECK-NEXT: decl %ebp
; CHECK-NEXT: cmpl $7, %ebp
; CHECK-NEXT: jne .LBB2_2
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: cmpl $3, %r14d
+; CHECK-NEXT: cmpl $3, %ebx
; CHECK-NEXT: jne .LBB2_4
; CHECK-NEXT: # %bb.6:
; CHECK-NEXT: testl %ebp, %ebp
; CHECK-NEXT: jne .LBB2_5
; CHECK-NEXT: # %bb.7:
-; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: incl %ebx
; CHECK-NEXT: jmp .LBB2_8
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: callq foo
@@ -269,9 +269,9 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rcx,%rax)
; CHECK-NEXT: .LBB2_5:
-; CHECK-NEXT: decl %r14d
+; CHECK-NEXT: decl %ebx
; CHECK-NEXT: .LBB2_8:
-; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: movl %ebx, %eax
; CHECK-NEXT: addq $1096, %rsp # imm = 0x448
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
@@ -297,15 +297,15 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; IPRA-NEXT: jg .LBB2_4
; IPRA-NEXT: # %bb.1: # %.preheader
; IPRA-NEXT: movl $7, %ecx
-; IPRA-NEXT: movl $buf, %r8d
+; IPRA-NEXT: movl $buf, %edx
; IPRA-NEXT: movl $32, %esi
; IPRA-NEXT: movw $8, %di
-; IPRA-NEXT: movl $buf+2048, %edx
+; IPRA-NEXT: movl $buf+2048, %r8d
; IPRA-NEXT: .p2align 4, 0x90
; IPRA-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
-; IPRA-NEXT: tileloadd (%r8,%rsi), %tmm0
+; IPRA-NEXT: tileloadd (%rdx,%rsi), %tmm0
; IPRA-NEXT: callq foo
-; IPRA-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; IPRA-NEXT: tilestored %tmm0, (%r8,%rsi)
; IPRA-NEXT: callq foo
; IPRA-NEXT: decl %ecx
; IPRA-NEXT: cmpl $7, %ecx
@@ -485,14 +485,14 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $1088, %rsp # imm = 0x440
-; CHECK-NEXT: movl %edi, %r15d
+; CHECK-NEXT: movl %edi, %ebx
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, (%rsp)
; CHECK-NEXT: movb $1, (%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf, %r14d
-; CHECK-NEXT: movl $32, %ebx
+; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: movw $8, %bp
; CHECK-NEXT: movl $buf+2048, %r12d
; CHECK-NEXT: .p2align 4, 0x90
@@ -500,17 +500,17 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg (%rsp)
-; CHECK-NEXT: testl %r15d, %r15d
+; CHECK-NEXT: testl %ebx, %ebx
; CHECK-NEXT: jle .LBB3_3
; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT: tileloadd (%r14,%rbx), %tmm0
+; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg (%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
-; CHECK-NEXT: tilestored %tmm0, (%r12,%rbx)
+; CHECK-NEXT: tilestored %tmm0, (%r12,%r15)
; CHECK-NEXT: callq foo
; CHECK-NEXT: jmp .LBB3_1
; CHECK-NEXT: .LBB3_3:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
index afadaff27fe24..f3b7591f34619 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -stop-before virtregrewriter | FileCheck %s
define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row_from, i32 %c_row_to, i32 %c_row_tile, i32 %c_col_from, i32 %c_col_to, i32 %c_col_tile) {
+ ; Check LEA64_32r register is split to COPY10
; CHECK-LABEL: name: foo
; CHECK: bb.0.entry:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
@@ -56,6 +57,7 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: MOV64mr %stack.4, 1, $noreg, 0, $noreg, [[MOVSX64rr32_1]] :: (store (s64) into %stack.4)
; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %84.sub_32bit
; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[COPY]]
; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %88.sub_32bit
; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]]
@@ -63,13 +65,13 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]]
; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8)
; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64_nosp = IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_2]], implicit-def dead $eflags
; CHECK-NEXT: [[ADD64rr:%[0-9]+]]:gr64_nosp = ADD64rr [[ADD64rr]], [[MOVSX64rm32_]], implicit-def dead $eflags
- ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[ADD64rr]], 0, $noreg
+ ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY2]], 4, [[ADD64rr]], 0, $noreg
; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9)
; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]]
; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64 = IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_2]], implicit-def dead $eflags
; CHECK-NEXT: [[SHL64ri:%[0-9]+]]:gr64 = SHL64ri [[SHL64ri]], 2, implicit-def dead $eflags
; CHECK-NEXT: MOV64mr %stack.10, 1, $noreg, 0, $noreg, [[SHL64ri]] :: (store (s64) into %stack.10)
@@ -77,28 +79,29 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_]] :: (store (s64) into %stack.5)
; CHECK-NEXT: [[LEA64_32r2:%[0-9]+]]:gr32 = LEA64_32r %61, 4, [[MOVSX64rm32_]], 0, $noreg
; CHECK-NEXT: MOV32mr %stack.11, 1, $noreg, 0, $noreg, [[LEA64_32r2]] :: (store (s32) into %stack.11)
- ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[LEA64r1]] :: (store (s64) into %stack.13)
+ ; CHECK-NEXT: MOV64mr %stack.12, 1, $noreg, 0, $noreg, [[LEA64r1]] :: (store (s64) into %stack.12)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.for.cond14.preheader:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
; CHECK-NEXT: CMP32rm [[MOV32rm3]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16)
+ ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
; CHECK-NEXT: JCC_1 %bb.5, 13, implicit killed $eflags
; CHECK-NEXT: JMP_1 %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.for.body17.lr.ph:
; CHECK-NEXT: successors: %bb.6(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
+ ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64 = nsw IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_]], implicit-def dead $eflags
; CHECK-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm]], %stack.3, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.3)
; CHECK-NEXT: [[ADD64rm1:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm1]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1)
- ; CHECK-NEXT: MOV64mr %stack.12, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.12)
+ ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.13)
; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %stack.11, 1, $noreg, 0, $noreg :: (load (s32) from %stack.11)
; CHECK-NEXT: undef %68.sub_32bit:gr64_nosp = COPY [[MOV32rm4]]
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
- ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+ ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
+ ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
; CHECK-NEXT: JMP_1 %bb.6
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4.for.cond.cleanup:
@@ -107,10 +110,10 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: bb.5.for.cond.cleanup16:
; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
+ ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
; CHECK-NEXT: [[ADD64rm1:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm1]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7)
- ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10)
- ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm2]], implicit-def dead $eflags :: (store (s64) into %stack.9)
+ ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10)
+ ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm3]], implicit-def dead $eflags :: (store (s64) into %stack.9)
; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.6)
; CHECK-NEXT: CMP64rm [[ADD64rm1]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8)
; CHECK-NEXT: JCC_1 %bb.2, 12, implicit killed $eflags
@@ -120,39 +123,39 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV32rm2]].sub_16bit, %88.sub_16bit
- ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load (s64) from %stack.12)
- ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm2]].sub_16bit, [[SUB32rr]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_]], 0, $noreg
+ ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13)
+ ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm2]].sub_16bit, [[SUB32rr]].sub_16bit, [[MOV64rm4]], 1, [[MOVSX64rr32_]], 0, $noreg
; CHECK-NEXT: [[MOVSX64rr32_7:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOVSX64rr32_7]].sub_32bit
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY %88
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]]
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr32 = COPY [[MOV32rm2]]
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr32 = COPY [[SUB32rr]]
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY [[COPY1]]
- ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY9]], 1, [[MOVSX64rr32_7]], 0, $noreg
- ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64_nosp = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
- ; Check LEA64_32r register is split to COPY10
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr32 = COPY [[SUB32rr]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[MOV32rm2]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY %88
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr64 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY10]], 1, [[MOVSX64rr32_7]], 0, $noreg
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr32 = COPY [[LEA64_32r]]
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]]
; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
- ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm4]], 0, $noreg
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr32 = COPY [[COPY10]]
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[COPY9]]
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY8]]
- ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13)
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY7]]
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[COPY6]]
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64_nosp = COPY [[COPY5]]
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64_nosp = COPY [[COPY3]]
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64_nosp = COPY [[COPY2]]
- ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY13]].sub_16bit, [[COPY11]].sub_16bit, [[COPY12]].sub_16bit, [[PTDPBSSDV]], [[PTILELOADDV]], [[PTILELOADDV1]]
- ; CHECK-NEXT: PTILESTOREDV [[COPY13]].sub_16bit, [[COPY18]].sub_16bit, [[MOV64rm]], 1, [[COPY16]], 0, $noreg, [[PTDPBSSDV]]
- ; CHECK-NEXT: [[ADD64rr1:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr1]], [[COPY15]], implicit-def dead $eflags
- ; CHECK-NEXT: [[ADD64rr2:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr2]], [[MOV64rm5]], implicit-def dead $eflags
- ; CHECK-NEXT: [[MOVSX64rr32_7]].sub_32bit:gr64_nosp = ADD32rr [[MOVSX64rr32_7]].sub_32bit, [[COPY11]], implicit-def dead $eflags
- ; CHECK-NEXT: CMP64rr [[ADD64rr1]], [[COPY14]], implicit-def $eflags
+ ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[COPY11]].sub_16bit, [[LEA64r2]], 1, [[COPY12]], 0, $noreg
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr64 = COPY [[COPY12]]
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr32 = COPY [[COPY11]]
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY10]]
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64_nosp = COPY [[COPY9]]
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64_nosp = COPY [[COPY8]]
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64_nosp = COPY [[COPY7]]
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
+ ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:gr32 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[COPY21:%[0-9]+]]:gr32 = COPY [[COPY3]]
+ ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load (s64) from %stack.12)
+ ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY20]].sub_16bit, [[COPY14]].sub_16bit, [[COPY21]].sub_16bit, [[PTDPBSSDV]], [[PTILELOADDV]], [[PTILELOADDV1]]
+ ; CHECK-NEXT: PTILESTOREDV [[COPY20]].sub_16bit, [[COPY15]].sub_16bit, [[MOV64rm1]], 1, [[COPY17]], 0, $noreg, [[PTDPBSSDV]]
+ ; CHECK-NEXT: [[ADD64rr1:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr1]], [[COPY18]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64rr2:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr2]], [[MOV64rm4]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[MOVSX64rr32_7]].sub_32bit:gr64_nosp = ADD32rr [[MOVSX64rr32_7]].sub_32bit, [[COPY14]], implicit-def dead $eflags
+ ; CHECK-NEXT: CMP64rr [[ADD64rr1]], [[COPY19]], implicit-def $eflags
; CHECK-NEXT: JCC_1 %bb.6, 12, implicit killed $eflags
; CHECK-NEXT: JMP_1 %bb.5
entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll b/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll
index e737277ec1278..46b5f62456cdc 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll
@@ -18,20 +18,20 @@ define dso_local void @test_chain(ptr %A_mem, ptr %B_mem, ptr %C_mem) {
; CHECK-NEXT: movb $16, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $64, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movl $64, %r8d
+; CHECK-NEXT: movl $64, %eax
; CHECK-NEXT: movw $64, %cx
-; CHECK-NEXT: movw $16, %ax
-; CHECK-NEXT: tileloadd (%rdi,%r8), %tmm0
+; CHECK-NEXT: movw $16, %r8w
+; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm0
; CHECK-NEXT: addq $1024, %rdi # imm = 0x400
-; CHECK-NEXT: tileloadd (%rdi,%r8), %tmm1
-; CHECK-NEXT: tileloadd (%rdx,%r8), %tmm3
+; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm1
+; CHECK-NEXT: tileloadd (%rdx,%rax), %tmm3
; CHECK-NEXT: leaq 1024(%rdx), %rdi
-; CHECK-NEXT: tileloadd (%rdi,%r8), %tmm2
-; CHECK-NEXT: tileloadd (%rsi,%r8), %tmm4
+; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm2
+; CHECK-NEXT: tileloadd (%rsi,%rax), %tmm4
; CHECK-NEXT: tdpbssd %tmm4, %tmm0, %tmm3
-; CHECK-NEXT: tilestored %tmm3, (%rdx,%r8)
+; CHECK-NEXT: tilestored %tmm3, (%rdx,%rax)
; CHECK-NEXT: tdpbssd %tmm4, %tmm1, %tmm2
-; CHECK-NEXT: tilestored %tmm2, (%rdi,%r8)
+; CHECK-NEXT: tilestored %tmm2, (%rdi,%rax)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
index 479c859ec88a8..714590d3c156b 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -237,15 +237,15 @@ define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movl $buf, %r8d
+; CHECK-NEXT: movl $buf, %ecx
; CHECK-NEXT: movl $32, %edx
-; CHECK-NEXT: leal -1(%rsi), %ecx
+; CHECK-NEXT: leal -1(%rsi), %r8d
; CHECK-NEXT: jmp .LBB4_1
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB4_3: # %if.false
; CHECK-NEXT: # in Loop: Header=BB4_1 Depth=1
-; CHECK-NEXT: movl %ecx, %esi
-; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r8d, %esi
+; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: cmpw $7, %si
; CHECK-NEXT: jne .LBB4_5
; CHECK-NEXT: .LBB4_1: # %loop.bb1
@@ -256,7 +256,7 @@ define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: # %bb.2: # %if.true
; CHECK-NEXT: # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%r8,%rdx)
+; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx)
; CHECK-NEXT: cmpw $7, %si
; CHECK-NEXT: je .LBB4_1
; CHECK-NEXT: .LBB4_5: # %exit
@@ -296,7 +296,7 @@ define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: xorl %r8d, %r8d
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movl $buf, %ecx
; CHECK-NEXT: movl $32, %edx
; CHECK-NEXT: xorl %esi, %esi
@@ -307,8 +307,8 @@ define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-NEXT: decl %esi
; CHECK-NEXT: .LBB5_4: # %loop.bb2
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: leal (%rdi,%rsi), %eax
-; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: leal (%rdi,%rsi), %r8d
+; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: cmpw $7, %si
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm0
@@ -316,7 +316,7 @@ define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-NEXT: jne .LBB5_5
; CHECK-NEXT: .LBB5_1: # %loop.bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %r8b, %r8b
+; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB5_3
; CHECK-NEXT: # %bb.2: # %if.true
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
index 464eae6182f96..e3c6f039cf0be 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
@@ -22,14 +22,14 @@ define dso_local void @test1(ptr%buf) nounwind {
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $64, %eax
-; CHECK-NEXT: movw $8, %r14w
+; CHECK-NEXT: movw $8, %bp
; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm3
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %loop.header
@@ -51,8 +51,8 @@ define dso_local void @test1(ptr%buf) nounwind {
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
-; CHECK-NEXT: incl %ebp
-; CHECK-NEXT: cmpw $100, %bp
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpw $100, %r14w
; CHECK-NEXT: jl .LBB0_2
; CHECK-NEXT: .LBB0_3: # %exit
; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
@@ -112,8 +112,8 @@ define dso_local void @test2(ptr%buf) nounwind {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
-; CHECK-NEXT: movq %rdi, %r14
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
@@ -123,12 +123,12 @@ define dso_local void @test2(ptr%buf) nounwind {
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm2
-; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
-; CHECK-NEXT: tileloadd (%r14,%r15), %tmm1
+; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
+; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: tilestored %tmm2, (%r14,%r15)
-; CHECK-NEXT: incl %ebx
-; CHECK-NEXT: cmpw $100, %bx
+; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpw $100, %r14w
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
; CHECK-NEXT: addq $72, %rsp
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
index 7f82209d71732..e4a2279f4675f 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -137,26 +137,26 @@ define dso_local void @test3(ptr%buf) nounwind {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
-; CHECK-NEXT: movq %rdi, %r15
+; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movl $32, %r14d
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %r15d, %r15d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: tilestored %tmm0, (%r15,%r14)
+; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: tileloadd (%r15,%r14), %tmm1
-; CHECK-NEXT: tileloadd (%r15,%r14), %tmm2
+; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
+; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%r15,%r14)
+; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: incl %ebx
-; CHECK-NEXT: cmpw $100, %bx
+; CHECK-NEXT: incl %r15d
+; CHECK-NEXT: cmpw $100, %r15w
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
; CHECK-NEXT: addq $72, %rsp
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill.ll b/llvm/test/CodeGen/X86/AMX/amx-spill.ll
index efd8935230431..a04715bd61322 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill.ll
@@ -28,30 +28,25 @@ define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) nounwind
; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movl $buf, %r8d
+; CHECK-NEXT: movl $buf, %ecx
; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
-; CHECK-NEXT: movabsq $64, %rcx
-; CHECK-NEXT: tilestored %tmm1, -64(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm3
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm4
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm2
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm5
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm0
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1
+; CHECK-NEXT: movabsq $64, %r8
+; CHECK-NEXT: tilestored %tmm1, -64(%rsp,%r8) # 1024-byte Folded Spill
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm3
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm4
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm2
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm5
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm6
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm7
-; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
-; CHECK-NEXT: jmp .LBB0_3
-; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: movl $buf2, %ecx
+; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm6
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm7
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1
-; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: tdpbssd %tmm7, %tmm6, %tmm1
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd -64(%rsp,%rax), %tmm7 # 1024-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
index 106a4d75418e8..565d3588710e3 100644
--- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -452,30 +452,30 @@ block4: ; preds = %4, %.lr.ph
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
; BWON-LABEL: MergeLoadStoreBaseIndexOffset:
; BWON: # %bb.0:
-; BWON-NEXT: movl %ecx, %r8d
+; BWON-NEXT: movl %ecx, %eax
; BWON-NEXT: xorl %ecx, %ecx
; BWON-NEXT: .p2align 4, 0x90
; BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movq (%rdi,%rcx,8), %rax
-; BWON-NEXT: movzwl (%rdx,%rax), %eax
-; BWON-NEXT: movw %ax, (%rsi,%rcx,2)
+; BWON-NEXT: movq (%rdi,%rcx,8), %r8
+; BWON-NEXT: movzwl (%rdx,%r8), %r8d
+; BWON-NEXT: movw %r8w, (%rsi,%rcx,2)
; BWON-NEXT: incq %rcx
-; BWON-NEXT: cmpl %ecx, %r8d
+; BWON-NEXT: cmpl %ecx, %eax
; BWON-NEXT: jne .LBB9_1
; BWON-NEXT: # %bb.2:
; BWON-NEXT: retq
;
; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
; BWOFF: # %bb.0:
-; BWOFF-NEXT: movl %ecx, %r8d
+; BWOFF-NEXT: movl %ecx, %eax
; BWOFF-NEXT: xorl %ecx, %ecx
; BWOFF-NEXT: .p2align 4, 0x90
; BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movq (%rdi,%rcx,8), %rax
-; BWOFF-NEXT: movw (%rdx,%rax), %ax
-; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2)
+; BWOFF-NEXT: movq (%rdi,%rcx,8), %r8
+; BWOFF-NEXT: movw (%rdx,%r8), %r8w
+; BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2)
; BWOFF-NEXT: incq %rcx
-; BWOFF-NEXT: cmpl %ecx, %r8d
+; BWOFF-NEXT: cmpl %ecx, %eax
; BWOFF-NEXT: jne .LBB9_1
; BWOFF-NEXT: # %bb.2:
; BWOFF-NEXT: retq
@@ -509,30 +509,30 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
; BWON: # %bb.0:
-; BWON-NEXT: xorl %r8d, %r8d
+; BWON-NEXT: xorl %eax, %eax
; BWON-NEXT: .p2align 4, 0x90
; BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movsbq (%rsi), %rax
-; BWON-NEXT: movzwl (%rdx,%rax), %eax
-; BWON-NEXT: movw %ax, (%rdi,%r8)
+; BWON-NEXT: movsbq (%rsi), %r8
+; BWON-NEXT: movzwl (%rdx,%r8), %r8d
+; BWON-NEXT: movw %r8w, (%rdi,%rax)
; BWON-NEXT: incq %rsi
-; BWON-NEXT: addq $2, %r8
-; BWON-NEXT: cmpq %rcx, %r8
+; BWON-NEXT: addq $2, %rax
+; BWON-NEXT: cmpq %rcx, %rax
; BWON-NEXT: jl .LBB10_1
; BWON-NEXT: # %bb.2:
; BWON-NEXT: retq
;
; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
; BWOFF: # %bb.0:
-; BWOFF-NEXT: xorl %r8d, %r8d
+; BWOFF-NEXT: xorl %eax, %eax
; BWOFF-NEXT: .p2align 4, 0x90
; BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movsbq (%rsi), %rax
-; BWOFF-NEXT: movw (%rdx,%rax), %ax
-; BWOFF-NEXT: movw %ax, (%rdi,%r8)
+; BWOFF-NEXT: movsbq (%rsi), %r8
+; BWOFF-NEXT: movw (%rdx,%r8), %r8w
+; BWOFF-NEXT: movw %r8w, (%rdi,%rax)
; BWOFF-NEXT: incq %rsi
-; BWOFF-NEXT: addq $2, %r8
-; BWOFF-NEXT: cmpq %rcx, %r8
+; BWOFF-NEXT: addq $2, %rax
+; BWOFF-NEXT: cmpq %rcx, %rax
; BWOFF-NEXT: jl .LBB10_1
; BWOFF-NEXT: # %bb.2:
; BWOFF-NEXT: retq
@@ -568,30 +568,30 @@ define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i6
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; BWON: # %bb.0:
-; BWON-NEXT: movl %ecx, %r8d
+; BWON-NEXT: movl %ecx, %eax
; BWON-NEXT: xorl %ecx, %ecx
; BWON-NEXT: .p2align 4, 0x90
; BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movsbq (%rdi,%rcx), %rax
-; BWON-NEXT: movzwl (%rdx,%rax), %eax
-; BWON-NEXT: movw %ax, (%rsi,%rcx,2)
+; BWON-NEXT: movsbq (%rdi,%rcx), %r8
+; BWON-NEXT: movzwl (%rdx,%r8), %r8d
+; BWON-NEXT: movw %r8w, (%rsi,%rcx,2)
; BWON-NEXT: incq %rcx
-; BWON-NEXT: cmpl %ecx, %r8d
+; BWON-NEXT: cmpl %ecx, %eax
; BWON-NEXT: jne .LBB11_1
; BWON-NEXT: # %bb.2:
; BWON-NEXT: retq
;
; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; BWOFF: # %bb.0:
-; BWOFF-NEXT: movl %ecx, %r8d
+; BWOFF-NEXT: movl %ecx, %eax
; BWOFF-NEXT: xorl %ecx, %ecx
; BWOFF-NEXT: .p2align 4, 0x90
; BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax
-; BWOFF-NEXT: movw (%rdx,%rax), %ax
-; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2)
+; BWOFF-NEXT: movsbq (%rdi,%rcx), %r8
+; BWOFF-NEXT: movw (%rdx,%r8), %r8w
+; BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2)
; BWOFF-NEXT: incq %rcx
-; BWOFF-NEXT: cmpl %ecx, %r8d
+; BWOFF-NEXT: cmpl %ecx, %eax
; BWOFF-NEXT: jne .LBB11_1
; BWOFF-NEXT: # %bb.2:
; BWOFF-NEXT: retq
@@ -626,38 +626,38 @@ define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; BWON: # %bb.0:
-; BWON-NEXT: movl %ecx, %r8d
+; BWON-NEXT: movl %ecx, %eax
; BWON-NEXT: xorl %ecx, %ecx
; BWON-NEXT: .p2align 4, 0x90
; BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movsbq (%rdi,%rcx), %rax
-; BWON-NEXT: movzbl (%rdx,%rax), %r9d
-; BWON-NEXT: incl %eax
-; BWON-NEXT: movsbq %al, %rax
-; BWON-NEXT: movzbl (%rdx,%rax), %eax
+; BWON-NEXT: movsbq (%rdi,%rcx), %r8
+; BWON-NEXT: movzbl (%rdx,%r8), %r9d
+; BWON-NEXT: incl %r8d
+; BWON-NEXT: movsbq %r8b, %r8
+; BWON-NEXT: movzbl (%rdx,%r8), %r8d
; BWON-NEXT: movb %r9b, (%rsi,%rcx,2)
-; BWON-NEXT: movb %al, 1(%rsi,%rcx,2)
+; BWON-NEXT: movb %r8b, 1(%rsi,%rcx,2)
; BWON-NEXT: incq %rcx
-; BWON-NEXT: cmpl %ecx, %r8d
+; BWON-NEXT: cmpl %ecx, %eax
; BWON-NEXT: jne .LBB12_1
; BWON-NEXT: # %bb.2:
; BWON-NEXT: retq
;
; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; BWOFF: # %bb.0:
-; BWOFF-NEXT: movl %ecx, %r8d
+; BWOFF-NEXT: movl %ecx, %eax
; BWOFF-NEXT: xorl %ecx, %ecx
; BWOFF-NEXT: .p2align 4, 0x90
; BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax
-; BWOFF-NEXT: movb (%rdx,%rax), %r9b
-; BWOFF-NEXT: incl %eax
-; BWOFF-NEXT: movsbq %al, %rax
-; BWOFF-NEXT: movb (%rdx,%rax), %al
+; BWOFF-NEXT: movsbq (%rdi,%rcx), %r8
+; BWOFF-NEXT: movb (%rdx,%r8), %r9b
+; BWOFF-NEXT: incl %r8d
+; BWOFF-NEXT: movsbq %r8b, %r8
+; BWOFF-NEXT: movb (%rdx,%r8), %r8b
; BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2)
-; BWOFF-NEXT: movb %al, 1(%rsi,%rcx,2)
+; BWOFF-NEXT: movb %r8b, 1(%rsi,%rcx,2)
; BWOFF-NEXT: incq %rcx
-; BWOFF-NEXT: cmpl %ecx, %r8d
+; BWOFF-NEXT: cmpl %ecx, %eax
; BWOFF-NEXT: jne .LBB12_1
; BWOFF-NEXT: # %bb.2:
; BWOFF-NEXT: retq
@@ -921,11 +921,12 @@ define void @merge_heterogeneous(%struct.C* nocapture %p, %struct.C* nocapture %
}
define i32 @merge_store_load_store_seq(i32* %buff) {
-entry:
; CHECK-LABEL: merge_store_load_store_seq:
-; CHECK: movl 4(%rdi), %eax
-; CHECK-NEXT: movq $0, (%rdi)
-; CHECK-NEXT: retq
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl 4(%rdi), %eax
+; CHECK-NEXT: movq $0, (%rdi)
+; CHECK-NEXT: retq
+entry:
store i32 0, i32* %buff, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1
@@ -935,12 +936,13 @@ entry:
}
define i32 @merge_store_alias(i32* %buff, i32* %other) {
-entry:
; CHECK-LABEL: merge_store_alias:
-; CHECK: movl $0, (%rdi)
-; CHECK-NEXT: movl (%rsi), %eax
-; CHECK-NEXT: movl $0, 4(%rdi)
-; CHECK-NEXT: retq
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $0, (%rdi)
+; CHECK-NEXT: movl (%rsi), %eax
+; CHECK-NEXT: movl $0, 4(%rdi)
+; CHECK-NEXT: retq
+entry:
store i32 0, i32* %buff, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1
diff --git a/llvm/test/CodeGen/X86/StackColoring.ll b/llvm/test/CodeGen/X86/StackColoring.ll
index 248597ca81359..389d024dafd11 100644
--- a/llvm/test/CodeGen/X86/StackColoring.ll
+++ b/llvm/test/CodeGen/X86/StackColoring.ll
@@ -82,9 +82,9 @@ bb3:
}
;CHECK-LABEL: myCall_w4:
-;YESCOLOR: subq $120, %rsp
-;NOFIRSTUSE: subq $200, %rsp
-;NOCOLOR: subq $408, %rsp
+;YESCOLOR: subq $112, %rsp
+;NOFIRSTUSE: subq $208, %rsp
+;NOCOLOR: subq $400, %rsp
define i32 @myCall_w4(i32 %in) {
entry:
diff --git a/llvm/test/CodeGen/X86/add-and-not.ll b/llvm/test/CodeGen/X86/add-and-not.ll
index 9565a90be134c..99bfb94e689e2 100644
--- a/llvm/test/CodeGen/X86/add-and-not.ll
+++ b/llvm/test/CodeGen/X86/add-and-not.ll
@@ -92,16 +92,16 @@ define i8 @add_and_xor_extra_use(i8 %x, i8 %y) nounwind {
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %r14d
-; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %ebp, %eax
; CHECK-NEXT: notb %al
-; CHECK-NEXT: movzbl %al, %ebp
-; CHECK-NEXT: movl %ebp, %edi
+; CHECK-NEXT: movzbl %al, %r14d
+; CHECK-NEXT: movl %r14d, %edi
; CHECK-NEXT: callq use@PLT
-; CHECK-NEXT: andb %bl, %bpl
-; CHECK-NEXT: movzbl %bpl, %edi
+; CHECK-NEXT: andb %bl, %r14b
+; CHECK-NEXT: movzbl %r14b, %edi
; CHECK-NEXT: callq use@PLT
-; CHECK-NEXT: orb %r14b, %bl
+; CHECK-NEXT: orb %bpl, %bl
; CHECK-NEXT: movl %ebx, %eax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
index fdb5c020ad575..5ef3059863d38 100644
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -316,20 +316,20 @@ define %S @readd(ptr nocapture readonly %this, %S %arg.b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: addq (%rsi), %rdx
-; CHECK-NEXT: movq 8(%rsi), %r11
-; CHECK-NEXT: adcq $0, %r11
+; CHECK-NEXT: movq 8(%rsi), %rdi
+; CHECK-NEXT: adcq $0, %rdi
; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: movzbl %r10b, %edi
-; CHECK-NEXT: addq %rcx, %r11
-; CHECK-NEXT: adcq 16(%rsi), %rdi
+; CHECK-NEXT: movzbl %r10b, %r10d
+; CHECK-NEXT: addq %rcx, %rdi
+; CHECK-NEXT: adcq 16(%rsi), %r10
; CHECK-NEXT: setb %cl
; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: addq %r8, %rdi
+; CHECK-NEXT: addq %r8, %r10
; CHECK-NEXT: adcq 24(%rsi), %rcx
; CHECK-NEXT: addq %r9, %rcx
; CHECK-NEXT: movq %rdx, (%rax)
-; CHECK-NEXT: movq %r11, 8(%rax)
-; CHECK-NEXT: movq %rdi, 16(%rax)
+; CHECK-NEXT: movq %rdi, 8(%rax)
+; CHECK-NEXT: movq %r10, 16(%rax)
; CHECK-NEXT: movq %rcx, 24(%rax)
; CHECK-NEXT: retq
entry:
@@ -751,27 +751,27 @@ define i32 @add_U320_without_i128_add(ptr nocapture dereferenceable(40) %0, i64
; CHECK-NEXT: adcq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rax, %rdx
; CHECK-NEXT: adcq %rcx, %rdx
-; CHECK-NEXT: movq 24(%rdi), %r11
-; CHECK-NEXT: leaq (%r8,%r11), %r14
+; CHECK-NEXT: movq 24(%rdi), %rsi
+; CHECK-NEXT: leaq (%r8,%rsi), %r11
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: cmpq %r10, %rdx
; CHECK-NEXT: setb %bl
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: adcq %r14, %rbx
-; CHECK-NEXT: movq 32(%rdi), %r10
-; CHECK-NEXT: leaq (%r9,%r10), %rcx
-; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: cmpq %r14, %rbx
-; CHECK-NEXT: setb %sil
-; CHECK-NEXT: addq %r11, %r8
-; CHECK-NEXT: adcq %rcx, %rsi
+; CHECK-NEXT: adcq %r11, %rbx
+; CHECK-NEXT: movq 32(%rdi), %rcx
+; CHECK-NEXT: leaq (%r9,%rcx), %r10
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: cmpq %r11, %rbx
+; CHECK-NEXT: setb %r14b
+; CHECK-NEXT: addq %rsi, %r8
+; CHECK-NEXT: adcq %r10, %r14
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpq %rcx, %rsi
+; CHECK-NEXT: cmpq %r10, %r14
; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq %r10, %r9
+; CHECK-NEXT: addq %rcx, %r9
; CHECK-NEXT: movq %rdx, 16(%rdi)
; CHECK-NEXT: movq %rbx, 24(%rdi)
-; CHECK-NEXT: movq %rsi, 32(%rdi)
+; CHECK-NEXT: movq %r14, 32(%rdi)
; CHECK-NEXT: adcl $0, %eax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
@@ -1219,18 +1219,18 @@ define void @add_U256_without_i128_or_by_i64_words(ptr sret(%uint256) %0, ptr %1
; CHECK-LABEL: add_U256_without_i128_or_by_i64_words:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq (%rdx), %r8
+; CHECK-NEXT: movq (%rdx), %rcx
; CHECK-NEXT: movq 8(%rdx), %rdi
-; CHECK-NEXT: addq (%rsi), %r8
+; CHECK-NEXT: addq (%rsi), %rcx
; CHECK-NEXT: adcq 8(%rsi), %rdi
-; CHECK-NEXT: movq 16(%rdx), %rcx
-; CHECK-NEXT: adcq 16(%rsi), %rcx
+; CHECK-NEXT: movq 16(%rdx), %r8
+; CHECK-NEXT: adcq 16(%rsi), %r8
; CHECK-NEXT: movq 24(%rdx), %rdx
; CHECK-NEXT: adcq 24(%rsi), %rdx
; CHECK-NEXT: movq %rdx, (%rax)
-; CHECK-NEXT: movq %rcx, 8(%rax)
+; CHECK-NEXT: movq %r8, 8(%rax)
; CHECK-NEXT: movq %rdi, 16(%rax)
-; CHECK-NEXT: movq %r8, 24(%rax)
+; CHECK-NEXT: movq %rcx, 24(%rax)
; CHECK-NEXT: retq
%4 = load i64, ptr %1, align 8
%5 = load i64, ptr %2, align 8
@@ -1279,17 +1279,17 @@ define void @add_U256_without_i128_or_recursive(ptr sret(%uint256) %0, ptr %1, p
; CHECK-LABEL: add_U256_without_i128_or_recursive:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq (%rdx), %r8
+; CHECK-NEXT: movq (%rdx), %rcx
; CHECK-NEXT: movq 8(%rdx), %rdi
-; CHECK-NEXT: addq (%rsi), %r8
+; CHECK-NEXT: addq (%rsi), %rcx
; CHECK-NEXT: adcq 8(%rsi), %rdi
-; CHECK-NEXT: movq 16(%rdx), %rcx
+; CHECK-NEXT: movq 16(%rdx), %r8
; CHECK-NEXT: movq 24(%rdx), %rdx
-; CHECK-NEXT: adcq 16(%rsi), %rcx
+; CHECK-NEXT: adcq 16(%rsi), %r8
; CHECK-NEXT: adcq 24(%rsi), %rdx
-; CHECK-NEXT: movq %r8, (%rax)
+; CHECK-NEXT: movq %rcx, (%rax)
; CHECK-NEXT: movq %rdi, 8(%rax)
-; CHECK-NEXT: movq %rcx, 16(%rax)
+; CHECK-NEXT: movq %r8, 16(%rax)
; CHECK-NEXT: movq %rdx, 24(%rax)
; CHECK-NEXT: retq
%4 = load i64, ptr %1, align 8
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index c47a7bcec3383..216c2226b384c 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1771,56 +1771,52 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT: addq %rsi, %rcx
+; SSE2-NEXT: addq %rax, %rcx
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: addq %rbp, %rax
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT: leaq -1(%rdx,%rsi), %r11
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%rdi,%rdx), %rsi
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%r8,%rdx), %rdi
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%r9,%rdx), %r8
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%rbx,%rdx), %rbx
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%r10,%rdx), %r9
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%r13,%rdx), %r13
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%r12,%rdx), %r12
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%r14,%rdx), %r14
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leaq -1(%r15,%rdx), %r15
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%r13,%rbp), %r13
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%r12,%rbp), %r12
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%r15,%rbp), %r15
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%r14,%rbp), %r14
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%rbx,%rbp), %rbx
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%r11,%rbp), %r11
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%r10,%rbp), %r10
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%r9,%rbp), %r9
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%r8,%rbp), %r8
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%rdi,%rbp), %rdi
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%rsi,%rbp), %rsi
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: leaq -1(%rdx,%rbp), %rdx
; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE2-NEXT: leaq -1(%rbp,%rdx), %r10
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx
; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
@@ -1835,74 +1831,74 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: adcq $-1, %rbp
; SSE2-NEXT: shldq $63, %rax, %rbp
; SSE2-NEXT: shldq $63, %rcx, %rdx
-; SSE2-NEXT: movq %rdx, %xmm8
+; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: movq %rbp, %xmm0
-; SSE2-NEXT: shrq %r11
-; SSE2-NEXT: movq %r11, %xmm9
-; SSE2-NEXT: shrq %rsi
-; SSE2-NEXT: movq %rsi, %xmm2
-; SSE2-NEXT: shrq %rdi
-; SSE2-NEXT: movq %rdi, %xmm10
-; SSE2-NEXT: shrq %r8
-; SSE2-NEXT: movq %r8, %xmm4
-; SSE2-NEXT: shrq %rbx
-; SSE2-NEXT: movq %rbx, %xmm11
-; SSE2-NEXT: shrq %r9
-; SSE2-NEXT: movq %r9, %xmm7
; SSE2-NEXT: shrq %r13
-; SSE2-NEXT: movq %r13, %xmm12
+; SSE2-NEXT: movq %r13, %xmm3
; SSE2-NEXT: shrq %r12
-; SSE2-NEXT: movq %r12, %xmm1
-; SSE2-NEXT: shrq %r14
-; SSE2-NEXT: movq %r14, %xmm13
+; SSE2-NEXT: movq %r12, %xmm2
; SSE2-NEXT: shrq %r15
-; SSE2-NEXT: movq %r15, %xmm6
+; SSE2-NEXT: movq %r15, %xmm5
+; SSE2-NEXT: shrq %r14
+; SSE2-NEXT: movq %r14, %xmm4
+; SSE2-NEXT: shrq %rbx
+; SSE2-NEXT: movq %rbx, %xmm6
+; SSE2-NEXT: shrq %r11
+; SSE2-NEXT: movq %r11, %xmm7
+; SSE2-NEXT: shrq %r10
+; SSE2-NEXT: movq %r10, %xmm9
+; SSE2-NEXT: shrq %r9
+; SSE2-NEXT: movq %r9, %xmm8
+; SSE2-NEXT: shrq %r8
+; SSE2-NEXT: movq %r8, %xmm10
+; SSE2-NEXT: shrq %rdi
+; SSE2-NEXT: movq %rdi, %xmm11
+; SSE2-NEXT: shrq %rsi
+; SSE2-NEXT: movq %rsi, %xmm12
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shrq %rax
-; SSE2-NEXT: movq %rax, %xmm14
-; SSE2-NEXT: shrq %r10
-; SSE2-NEXT: movq %r10, %xmm5
+; SSE2-NEXT: movq %rax, %xmm13
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shrq %rax
-; SSE2-NEXT: movq %rax, %xmm15
+; SSE2-NEXT: movq %rax, %xmm14
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shrq %rax
-; SSE2-NEXT: movq %rax, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: movq %rax, %xmm15
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE2-NEXT: psllq $48, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; SSE2-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
-; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1]
-; SSE2-NEXT: movupd %xmm1, (%rax)
+; SSE2-NEXT: por %xmm8, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; SSE2-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm13, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT: movupd %xmm2, (%rax)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -1927,118 +1923,116 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vpextrw $5, %xmm0, %eax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT: vpextrw $7, %xmm0, %r11d
-; AVX1-NEXT: vpextrw $0, %xmm3, %r14d
-; AVX1-NEXT: vpextrw $1, %xmm3, %r15d
-; AVX1-NEXT: vpextrw $2, %xmm3, %r10d
-; AVX1-NEXT: vpextrw $3, %xmm3, %r9d
-; AVX1-NEXT: vpextrw $4, %xmm3, %r8d
-; AVX1-NEXT: vpextrw $5, %xmm3, %ebx
-; AVX1-NEXT: vpextrw $6, %xmm3, %ebp
-; AVX1-NEXT: vpextrw $7, %xmm3, %edi
-; AVX1-NEXT: vpextrw $1, %xmm0, %edx
-; AVX1-NEXT: vpextrw $0, %xmm0, %esi
+; AVX1-NEXT: vpextrw $6, %xmm0, %r10d
+; AVX1-NEXT: vpextrw $7, %xmm0, %edx
+; AVX1-NEXT: vpextrw $0, %xmm3, %edi
+; AVX1-NEXT: vpextrw $1, %xmm3, %r8d
+; AVX1-NEXT: vpextrw $2, %xmm3, %r9d
+; AVX1-NEXT: vpextrw $3, %xmm3, %r11d
+; AVX1-NEXT: vpextrw $4, %xmm3, %ebx
+; AVX1-NEXT: vpextrw $5, %xmm3, %r14d
+; AVX1-NEXT: vpextrw $6, %xmm3, %r15d
+; AVX1-NEXT: vpextrw $7, %xmm3, %esi
+; AVX1-NEXT: vpextrw $1, %xmm0, %r13d
+; AVX1-NEXT: vpextrw $0, %xmm0, %r12d
; AVX1-NEXT: vpextrw $1, %xmm1, %ecx
-; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: addq %r13, %rcx
; AVX1-NEXT: vpextrw $0, %xmm1, %eax
-; AVX1-NEXT: addq %rsi, %rax
-; AVX1-NEXT: vpextrw $7, %xmm2, %edx
-; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi
-; AVX1-NEXT: vpextrw $6, %xmm2, %edx
-; AVX1-NEXT: leaq -1(%rbp,%rdx), %rbp
-; AVX1-NEXT: vpextrw $5, %xmm2, %edx
-; AVX1-NEXT: leaq -1(%rbx,%rdx), %rbx
-; AVX1-NEXT: vpextrw $4, %xmm2, %edx
-; AVX1-NEXT: leaq -1(%r8,%rdx), %r8
-; AVX1-NEXT: vpextrw $3, %xmm2, %edx
-; AVX1-NEXT: leaq -1(%r9,%rdx), %r9
-; AVX1-NEXT: vpextrw $2, %xmm2, %edx
-; AVX1-NEXT: leaq -1(%r10,%rdx), %r10
-; AVX1-NEXT: vpextrw $1, %xmm2, %edx
-; AVX1-NEXT: leaq -1(%r15,%rdx), %r13
-; AVX1-NEXT: vpextrw $0, %xmm2, %edx
-; AVX1-NEXT: leaq -1(%r14,%rdx), %r12
-; AVX1-NEXT: vpextrw $7, %xmm1, %edx
-; AVX1-NEXT: leaq -1(%r11,%rdx), %r15
+; AVX1-NEXT: addq %r12, %rax
+; AVX1-NEXT: vpextrw $7, %xmm2, %r12d
+; AVX1-NEXT: leaq -1(%rsi,%r12), %rsi
+; AVX1-NEXT: vpextrw $6, %xmm2, %r12d
+; AVX1-NEXT: leaq -1(%r15,%r12), %rbp
+; AVX1-NEXT: vpextrw $5, %xmm2, %r15d
+; AVX1-NEXT: leaq -1(%r14,%r15), %r13
+; AVX1-NEXT: vpextrw $4, %xmm2, %r14d
+; AVX1-NEXT: leaq -1(%rbx,%r14), %r12
+; AVX1-NEXT: vpextrw $3, %xmm2, %ebx
+; AVX1-NEXT: leaq -1(%r11,%rbx), %r15
+; AVX1-NEXT: vpextrw $2, %xmm2, %r11d
+; AVX1-NEXT: leaq -1(%r9,%r11), %r14
+; AVX1-NEXT: vpextrw $1, %xmm2, %r9d
+; AVX1-NEXT: leaq -1(%r8,%r9), %rbx
+; AVX1-NEXT: vpextrw $0, %xmm2, %r8d
+; AVX1-NEXT: leaq -1(%rdi,%r8), %r11
+; AVX1-NEXT: vpextrw $7, %xmm1, %edi
+; AVX1-NEXT: leaq -1(%rdx,%rdi), %r9
; AVX1-NEXT: vpextrw $6, %xmm1, %edx
-; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX1-NEXT: leaq -1(%rsi,%rdx), %r14
+; AVX1-NEXT: leaq -1(%r10,%rdx), %r8
; AVX1-NEXT: vpextrw $5, %xmm1, %edx
-; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX1-NEXT: leaq -1(%rsi,%rdx), %r11
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi
; AVX1-NEXT: vpextrw $4, %xmm1, %edx
-; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX1-NEXT: leaq -1(%rsi,%rdx), %rdx
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX1-NEXT: leaq -1(%r10,%rdx), %rdx
; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vpextrw $3, %xmm0, %edx
-; AVX1-NEXT: vpextrw $3, %xmm1, %esi
-; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdx
+; AVX1-NEXT: vpextrw $3, %xmm1, %r10d
+; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx
; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vpextrw $2, %xmm0, %edx
-; AVX1-NEXT: vpextrw $2, %xmm1, %esi
-; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdx
+; AVX1-NEXT: vpextrw $2, %xmm1, %r10d
+; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx
; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: addq $-1, %rcx
-; AVX1-NEXT: movl $0, %esi
-; AVX1-NEXT: adcq $-1, %rsi
+; AVX1-NEXT: movl $0, %r10d
+; AVX1-NEXT: adcq $-1, %r10
; AVX1-NEXT: addq $-1, %rax
; AVX1-NEXT: adcq $-1, %rdx
; AVX1-NEXT: shldq $63, %rax, %rdx
-; AVX1-NEXT: shldq $63, %rcx, %rsi
-; AVX1-NEXT: shrq %rdi
-; AVX1-NEXT: vmovq %rdi, %xmm8
+; AVX1-NEXT: shldq $63, %rcx, %r10
+; AVX1-NEXT: shrq %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm0
; AVX1-NEXT: shrq %rbp
-; AVX1-NEXT: vmovq %rbp, %xmm9
-; AVX1-NEXT: shrq %rbx
-; AVX1-NEXT: vmovq %rbx, %xmm0
-; AVX1-NEXT: shrq %r8
-; AVX1-NEXT: vmovq %r8, %xmm1
-; AVX1-NEXT: shrq %r9
-; AVX1-NEXT: vmovq %r9, %xmm12
-; AVX1-NEXT: shrq %r10
-; AVX1-NEXT: vmovq %r10, %xmm13
+; AVX1-NEXT: vmovq %rbp, %xmm1
; AVX1-NEXT: shrq %r13
-; AVX1-NEXT: vmovq %r13, %xmm14
+; AVX1-NEXT: vmovq %r13, %xmm2
; AVX1-NEXT: shrq %r12
-; AVX1-NEXT: vmovq %r12, %xmm15
+; AVX1-NEXT: vmovq %r12, %xmm3
; AVX1-NEXT: shrq %r15
-; AVX1-NEXT: vmovq %r15, %xmm10
+; AVX1-NEXT: vmovq %r15, %xmm4
; AVX1-NEXT: shrq %r14
-; AVX1-NEXT: vmovq %r14, %xmm11
+; AVX1-NEXT: vmovq %r14, %xmm5
+; AVX1-NEXT: shrq %rbx
+; AVX1-NEXT: vmovq %rbx, %xmm6
; AVX1-NEXT: shrq %r11
-; AVX1-NEXT: vmovq %r11, %xmm2
+; AVX1-NEXT: vmovq %r11, %xmm7
+; AVX1-NEXT: shrq %r9
+; AVX1-NEXT: vmovq %r9, %xmm8
+; AVX1-NEXT: shrq %r8
+; AVX1-NEXT: vmovq %r8, %xmm9
+; AVX1-NEXT: shrq %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm10
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: vmovq %rax, %xmm3
-; AVX1-NEXT: vmovq %rsi, %xmm4
-; AVX1-NEXT: vmovq %rdx, %xmm5
+; AVX1-NEXT: vmovq %rax, %xmm11
+; AVX1-NEXT: vmovq %r10, %xmm12
+; AVX1-NEXT: vmovq %rdx, %xmm13
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: vmovq %rax, %xmm6
+; AVX1-NEXT: vmovq %rax, %xmm14
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: vmovq %rax, %xmm7
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm8[6,7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vmovq %rax, %xmm15
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
; AVX1-NEXT: vpslld $16, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
@@ -2064,7 +2058,7 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm9
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
@@ -2074,63 +2068,62 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vmovq %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: vmovq %xmm7, %r13
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT: vmovq %xmm2, %r11
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %r14
-; AVX2-NEXT: vpextrq $1, %xmm0, %rbx
-; AVX2-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX2-NEXT: vpextrq $1, %xmm7, %r12
-; AVX2-NEXT: vpextrq $1, %xmm6, %r15
+; AVX2-NEXT: vmovq %xmm2, %rbp
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8
+; AVX2-NEXT: vmovq %xmm8, %r8
+; AVX2-NEXT: vpextrq $1, %xmm8, %r15
+; AVX2-NEXT: vpextrq $1, %xmm2, %r14
+; AVX2-NEXT: vpextrq $1, %xmm7, %rbx
+; AVX2-NEXT: vpextrq $1, %xmm6, %rsi
; AVX2-NEXT: vpextrq $1, %xmm5, %rdx
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
-; AVX2-NEXT: vmovq %xmm3, %rbp
-; AVX2-NEXT: vpextrq $1, %xmm9, %r9
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vmovq %xmm3, %rdi
+; AVX2-NEXT: vpextrq $1, %xmm0, %r10
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX2-NEXT: addq %rbx, %rdi
-; AVX2-NEXT: movq %rdi, %rbx
-; AVX2-NEXT: vpextrq $1, %xmm8, %r10
-; AVX2-NEXT: addq %rsi, %r10
-; AVX2-NEXT: vpextrq $1, %xmm7, %rsi
-; AVX2-NEXT: addq %r12, %rsi
-; AVX2-NEXT: movq %rsi, %r12
-; AVX2-NEXT: vpextrq $1, %xmm4, %r13
-; AVX2-NEXT: addq %r15, %r13
-; AVX2-NEXT: vpextrq $1, %xmm5, %r15
-; AVX2-NEXT: addq %rdx, %r15
-; AVX2-NEXT: vpextrq $1, %xmm3, %r8
-; AVX2-NEXT: addq %rcx, %r8
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-NEXT: vpextrq $1, %xmm9, %r11
+; AVX2-NEXT: addq %r15, %r11
+; AVX2-NEXT: vpextrq $1, %xmm8, %r9
+; AVX2-NEXT: addq %r14, %r9
+; AVX2-NEXT: movq %r9, %r14
+; AVX2-NEXT: vpextrq $1, %xmm7, %r9
+; AVX2-NEXT: addq %rbx, %r9
+; AVX2-NEXT: movq %r9, %rbx
+; AVX2-NEXT: vpextrq $1, %xmm4, %r15
+; AVX2-NEXT: addq %rsi, %r15
+; AVX2-NEXT: vpextrq $1, %xmm5, %r12
+; AVX2-NEXT: addq %rdx, %r12
+; AVX2-NEXT: vpextrq $1, %xmm3, %r9
+; AVX2-NEXT: addq %rcx, %r9
; AVX2-NEXT: vpextrq $1, %xmm6, %rsi
; AVX2-NEXT: addq %rax, %rsi
; AVX2-NEXT: vmovq %xmm6, %rdx
-; AVX2-NEXT: addq %rbp, %rdx
+; AVX2-NEXT: addq %rdi, %rdx
; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX2-NEXT: addq %r9, %rcx
-; AVX2-NEXT: vmovq %xmm0, %rdi
-; AVX2-NEXT: leaq -1(%r14,%rdi), %rax
+; AVX2-NEXT: addq %r10, %rcx
+; AVX2-NEXT: vmovq %xmm9, %r10
+; AVX2-NEXT: leaq -1(%r8,%r10), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm8, %rdi
-; AVX2-NEXT: leaq -1(%r11,%rdi), %rax
+; AVX2-NEXT: leaq -1(%rbp,%rdi), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm7, %rdi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: leaq -1(%rax,%rdi), %rax
+; AVX2-NEXT: leaq -1(%r13,%rdi), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm4, %rdi
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -2141,37 +2134,37 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX2-NEXT: leaq -1(%rax,%rdi), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm1, %rdi
-; AVX2-NEXT: vmovq %xmm3, %rbp
-; AVX2-NEXT: leaq -1(%rdi,%rbp), %rax
+; AVX2-NEXT: vmovq %xmm3, %r8
+; AVX2-NEXT: leaq -1(%rdi,%r8), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vmovq %xmm9, %rdi
-; AVX2-NEXT: vmovq %xmm2, %rbp
-; AVX2-NEXT: leaq -1(%rdi,%rbp), %rdi
+; AVX2-NEXT: vmovq %xmm0, %rdi
+; AVX2-NEXT: vmovq %xmm2, %r8
+; AVX2-NEXT: leaq -1(%rdi,%r8), %rdi
; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: addq $-1, %rbx
-; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl $0, %r9d
-; AVX2-NEXT: adcq $-1, %r9
-; AVX2-NEXT: addq $-1, %r10
-; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %r11
+; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %r8d
+; AVX2-NEXT: adcq $-1, %r8
+; AVX2-NEXT: addq $-1, %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %edi
; AVX2-NEXT: adcq $-1, %rdi
-; AVX2-NEXT: addq $-1, %r12
-; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %rbx
+; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %r11d
; AVX2-NEXT: adcq $-1, %r11
-; AVX2-NEXT: addq $-1, %r13
+; AVX2-NEXT: addq $-1, %r15
; AVX2-NEXT: movl $0, %r10d
; AVX2-NEXT: adcq $-1, %r10
-; AVX2-NEXT: addq $-1, %r15
+; AVX2-NEXT: addq $-1, %r12
; AVX2-NEXT: movl $0, %r14d
; AVX2-NEXT: adcq $-1, %r14
-; AVX2-NEXT: addq $-1, %r8
+; AVX2-NEXT: addq $-1, %r9
; AVX2-NEXT: movl $0, %ebp
; AVX2-NEXT: adcq $-1, %rbp
; AVX2-NEXT: addq $-1, %rsi
-; AVX2-NEXT: movl $0, %r12d
-; AVX2-NEXT: adcq $-1, %r12
+; AVX2-NEXT: movl $0, %r13d
+; AVX2-NEXT: adcq $-1, %r13
; AVX2-NEXT: addq $-1, %rdx
; AVX2-NEXT: movl $0, %ebx
; AVX2-NEXT: adcq $-1, %rbx
@@ -2180,64 +2173,64 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: shldq $63, %rcx, %rax
; AVX2-NEXT: shldq $63, %rdx, %rbx
-; AVX2-NEXT: shldq $63, %rsi, %r12
-; AVX2-NEXT: shldq $63, %r8, %rbp
-; AVX2-NEXT: shldq $63, %r15, %r14
-; AVX2-NEXT: shldq $63, %r13, %r10
+; AVX2-NEXT: shldq $63, %rsi, %r13
+; AVX2-NEXT: shldq $63, %r9, %rbp
+; AVX2-NEXT: shldq $63, %r12, %r14
+; AVX2-NEXT: shldq $63, %r15, %r10
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %r11
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %rdi
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: shldq $63, %rcx, %r9
-; AVX2-NEXT: vmovq %r9, %xmm8
+; AVX2-NEXT: shldq $63, %rcx, %r8
+; AVX2-NEXT: vmovq %r8, %xmm0
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm9
-; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vmovq %rdi, %xmm2
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vmovq %r11, %xmm12
+; AVX2-NEXT: vmovq %rcx, %xmm3
+; AVX2-NEXT: vmovq %r11, %xmm4
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm13
-; AVX2-NEXT: vmovq %r10, %xmm14
+; AVX2-NEXT: vmovq %rcx, %xmm5
+; AVX2-NEXT: vmovq %r10, %xmm6
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm15
-; AVX2-NEXT: vmovq %r14, %xmm10
+; AVX2-NEXT: vmovq %rcx, %xmm7
+; AVX2-NEXT: vmovq %r14, %xmm8
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm11
-; AVX2-NEXT: vmovq %rbp, %xmm2
+; AVX2-NEXT: vmovq %rcx, %xmm9
+; AVX2-NEXT: vmovq %rbp, %xmm10
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm3
-; AVX2-NEXT: vmovq %r12, %xmm4
-; AVX2-NEXT: vmovq %rbx, %xmm5
-; AVX2-NEXT: vmovq %rax, %xmm6
+; AVX2-NEXT: vmovq %rcx, %xmm11
+; AVX2-NEXT: vmovq %r13, %xmm12
+; AVX2-NEXT: vmovq %rbx, %xmm13
+; AVX2-NEXT: vmovq %rax, %xmm14
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: vmovq %rax, %xmm7
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8
-; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX2-NEXT: vmovq %rax, %xmm15
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
; AVX2-NEXT: vpslld $16, %xmm3, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
@@ -2266,25 +2259,25 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT: vmovq %xmm4, %rbx
-; AVX512-NEXT: vpextrq $1, %xmm4, %rbp
-; AVX512-NEXT: vmovq %xmm3, %rdi
-; AVX512-NEXT: vpextrq $1, %xmm3, %rsi
+; AVX512-NEXT: vmovq %xmm4, %r13
+; AVX512-NEXT: vpextrq $1, %xmm4, %r12
+; AVX512-NEXT: vmovq %xmm3, %r15
+; AVX512-NEXT: vpextrq $1, %xmm3, %r14
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT: vmovq %xmm2, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm2, %r10
+; AVX512-NEXT: vmovq %xmm2, %rbx
+; AVX512-NEXT: vpextrq $1, %xmm2, %r11
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %r13
-; AVX512-NEXT: vpextrq $1, %xmm2, %r14
+; AVX512-NEXT: vmovq %xmm2, %r10
+; AVX512-NEXT: vpextrq $1, %xmm2, %rax
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vmovq %xmm2, %r15
+; AVX512-NEXT: vmovq %xmm2, %rdi
; AVX512-NEXT: vpextrq $1, %xmm2, %r8
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %r11
-; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: vmovq %xmm2, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -2293,50 +2286,48 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT: vmovq %xmm4, %rax
-; AVX512-NEXT: addq %rbx, %rax
-; AVX512-NEXT: movq %rax, %rbx
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: addq %rbp, %rax
-; AVX512-NEXT: movq %rax, %r9
+; AVX512-NEXT: vmovq %xmm4, %rbp
+; AVX512-NEXT: addq %r13, %rbp
+; AVX512-NEXT: vpextrq $1, %xmm4, %r13
+; AVX512-NEXT: addq %r12, %r13
; AVX512-NEXT: vmovq %xmm3, %rcx
-; AVX512-NEXT: addq %rdi, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: addq %rsi, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq %r15, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm3, %r9
+; AVX512-NEXT: addq %r14, %r9
+; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT: vmovq %xmm2, %rbp
-; AVX512-NEXT: addq %rdx, %rbp
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: addq %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vmovq %xmm2, %r9
+; AVX512-NEXT: addq %rbx, %r9
+; AVX512-NEXT: vpextrq $1, %xmm2, %rbx
+; AVX512-NEXT: addq %r11, %rbx
+; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: addq %r13, %rax
-; AVX512-NEXT: movq %rax, %r13
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: addq %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vmovq %xmm2, %r11
+; AVX512-NEXT: addq %r10, %r11
+; AVX512-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vpextrq $1, %xmm2, %r10
+; AVX512-NEXT: addq %rax, %r10
+; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: addq %r15, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq %rdi, %rax
+; AVX512-NEXT: movq %rax, %r12
; AVX512-NEXT: vpextrq $1, %xmm2, %rax
; AVX512-NEXT: addq %r8, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: addq %r11, %rax
+; AVX512-NEXT: addq %rsi, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: vpextrq $1, %xmm2, %r12
-; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX512-NEXT: vpextrq $1, %xmm2, %r15
+; AVX512-NEXT: addq %rdx, %r15
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vmovq %xmm0, %r11
-; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX512-NEXT: vmovq %xmm0, %r10
+; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
; AVX512-NEXT: vpextrq $1, %xmm0, %r8
; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
; AVX512-NEXT: vmovq %xmm1, %rax
@@ -2346,34 +2337,29 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX512-NEXT: vpextrq $1, %xmm1, %rsi
; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
; AVX512-NEXT: addq %rsi, %rdx
-; AVX512-NEXT: addq $-1, %rbx
-; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movl $0, %r15d
-; AVX512-NEXT: adcq $-1, %r15
-; AVX512-NEXT: addq $-1, %r9
-; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq $-1, %rbp
+; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %r14d
; AVX512-NEXT: adcq $-1, %r14
+; AVX512-NEXT: addq $-1, %r13
+; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %ebx
+; AVX512-NEXT: adcq $-1, %rbx
; AVX512-NEXT: addq $-1, %rcx
-; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq %rcx, (%rsp) # 8-byte Spill
; AVX512-NEXT: movl $0, %esi
; AVX512-NEXT: adcq $-1, %rsi
; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movl $0, %r10d
-; AVX512-NEXT: adcq $-1, %r10
-; AVX512-NEXT: addq $-1, %rbp
-; AVX512-NEXT: movq %rbp, (%rsp) # 8-byte Spill
+; AVX512-NEXT: movl $0, %r11d
+; AVX512-NEXT: adcq $-1, %r11
+; AVX512-NEXT: addq $-1, %r9
+; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %r9d
; AVX512-NEXT: adcq $-1, %r9
; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: addq $-1, %r13
-; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movl $0, %eax
-; AVX512-NEXT: adcq $-1, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
@@ -2382,70 +2368,73 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: addq $-1, %r12
+; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %ebp
+; AVX512-NEXT: adcq $-1, %rbp
; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: addq $-1, %rcx
+; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: addq $-1, %r12
+; AVX512-NEXT: addq $-1, %r15
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: addq $-1, %r11
-; AVX512-NEXT: movl $0, %r13d
-; AVX512-NEXT: adcq $-1, %r13
+; AVX512-NEXT: addq $-1, %r10
+; AVX512-NEXT: movl $0, %r12d
+; AVX512-NEXT: adcq $-1, %r12
; AVX512-NEXT: addq $-1, %r8
+; AVX512-NEXT: movl $0, %ecx
+; AVX512-NEXT: adcq $-1, %rcx
+; AVX512-NEXT: addq $-1, %rdi
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
-; AVX512-NEXT: addq $-1, %rdi
-; AVX512-NEXT: movl $0, %ebx
-; AVX512-NEXT: adcq $-1, %rbx
; AVX512-NEXT: addq $-1, %rdx
-; AVX512-NEXT: movl $0, %ebp
-; AVX512-NEXT: adcq $-1, %rbp
-; AVX512-NEXT: shldq $63, %rdx, %rbp
-; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq $63, %rdi, %rbx
-; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq $63, %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: shldq $63, %r11, %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: shldq $63, %r12, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT: shldq $63, %rcx, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: movl $0, %r13d
+; AVX512-NEXT: adcq $-1, %r13
+; AVX512-NEXT: shldq $63, %rdx, %r13
+; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq $63, %rdi, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq $63, %r8, %rcx
+; AVX512-NEXT: movq %rcx, %r13
+; AVX512-NEXT: shldq $63, %r10, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: shldq $63, %r15, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: shldq $63, %rax, %rbp
+; AVX512-NEXT: shldq $63, %rax, %rdi
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq $63, %rax, %rbx
+; AVX512-NEXT: shldq $63, %rax, %r10
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: shldq $63, %rax, %rdi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %rbp
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rax, %rdx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rax, %rcx
-; AVX512-NEXT: movq (%rsp), %rax # 8-byte Reload
-; AVX512-NEXT: shldq $63, %rax, %r9
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: shldq $63, %rax, %r10
+; AVX512-NEXT: shldq $63, %rax, %r9
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: shldq $63, %rax, %r11
+; AVX512-NEXT: movq (%rsp), %rax # 8-byte Reload
; AVX512-NEXT: shldq $63, %rax, %rsi
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: shldq $63, %rax, %r14
+; AVX512-NEXT: shldq $63, %rax, %rbx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: shldq $63, %rax, %r15
-; AVX512-NEXT: vmovq %r15, %xmm0
-; AVX512-NEXT: vmovq %r14, %xmm1
-; AVX512-NEXT: vmovq %r10, %xmm2
+; AVX512-NEXT: shldq $63, %rax, %r14
+; AVX512-NEXT: vmovq %r14, %xmm0
+; AVX512-NEXT: vmovq %rbx, %xmm1
+; AVX512-NEXT: vmovq %r11, %xmm2
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %rsi, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
@@ -2467,8 +2456,8 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmovd %xmm2, %eax
; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %rdx, %xmm2
-; AVX512-NEXT: vmovq %rdi, %xmm3
+; AVX512-NEXT: vmovq %r15, %xmm2
+; AVX512-NEXT: vmovq %rdx, %xmm3
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
@@ -2477,16 +2466,15 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vmovd %xmm1, %eax
; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX512-NEXT: movq %rbx, %rax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %rbx, %xmm1
-; AVX512-NEXT: vmovq %rbp, %xmm2
+; AVX512-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %rbp, %xmm1
+; AVX512-NEXT: vmovq %r10, %xmm2
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmovd %xmm2, %eax
; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %r12, %xmm2
-; AVX512-NEXT: vmovq %r11, %xmm3
+; AVX512-NEXT: vmovq %rdi, %xmm2
+; AVX512-NEXT: vmovq %r8, %xmm3
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
@@ -2495,9 +2483,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vmovd %xmm1, %eax
; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $12, %r13d, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %r13, %xmm1
-; AVX512-NEXT: vmovq %r8, %xmm2
+; AVX512-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %r12, %xmm1
+; AVX512-NEXT: vmovq %r13, %xmm2
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmovd %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll
index e710ee2d629ea..22b4fddf88e45 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb.ll
+++ b/llvm/test/CodeGen/X86/avoid-sfb.ll
@@ -635,21 +635,21 @@ define void @test_limit_all(ptr noalias %s1, ptr nocapture %s2, i32 %x, ptr noc
; SSE-NEXT: movq %r8, %r15
; SSE-NEXT: movq %rcx, %r14
; SSE-NEXT: movl %edx, %ebp
-; SSE-NEXT: movq %rsi, %r12
-; SSE-NEXT: movq %rdi, %rbx
+; SSE-NEXT: movq %rsi, %rbx
+; SSE-NEXT: movq %rdi, %r12
; SSE-NEXT: movl %r9d, 12(%rdi)
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: cmpl $18, %ebp
; SSE-NEXT: jl .LBB9_2
; SSE-NEXT: # %bb.1: # %if.then
-; SSE-NEXT: movl %ebp, 4(%rbx)
-; SSE-NEXT: movq %rbx, %rdi
+; SSE-NEXT: movl %ebp, 4(%r12)
+; SSE-NEXT: movq %r12, %rdi
; SSE-NEXT: callq bar@PLT
; SSE-NEXT: .LBB9_2: # %if.end
; SSE-NEXT: movups (%r15), %xmm0
; SSE-NEXT: movups %xmm0, (%r14)
-; SSE-NEXT: movups (%rbx), %xmm0
-; SSE-NEXT: movups %xmm0, (%r12)
+; SSE-NEXT: movups (%r12), %xmm0
+; SSE-NEXT: movups %xmm0, (%rbx)
; SSE-NEXT: popq %rbx
; SSE-NEXT: .cfi_def_cfa_offset 40
; SSE-NEXT: popq %r12
@@ -682,21 +682,21 @@ define void @test_limit_all(ptr noalias %s1, ptr nocapture %s2, i32 %x, ptr noc
; AVX-NEXT: movq %r8, %r15
; AVX-NEXT: movq %rcx, %r14
; AVX-NEXT: movl %edx, %ebp
-; AVX-NEXT: movq %rsi, %r12
-; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: movq %rsi, %rbx
+; AVX-NEXT: movq %rdi, %r12
; AVX-NEXT: movl %r9d, 12(%rdi)
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: cmpl $18, %ebp
; AVX-NEXT: jl .LBB9_2
; AVX-NEXT: # %bb.1: # %if.then
-; AVX-NEXT: movl %ebp, 4(%rbx)
-; AVX-NEXT: movq %rbx, %rdi
+; AVX-NEXT: movl %ebp, 4(%r12)
+; AVX-NEXT: movq %r12, %rdi
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: .LBB9_2: # %if.end
; AVX-NEXT: vmovups (%r15), %xmm0
; AVX-NEXT: vmovups %xmm0, (%r14)
-; AVX-NEXT: vmovups (%rbx), %xmm0
-; AVX-NEXT: vmovups %xmm0, (%r12)
+; AVX-NEXT: vmovups (%r12), %xmm0
+; AVX-NEXT: vmovups %xmm0, (%rbx)
; AVX-NEXT: popq %rbx
; AVX-NEXT: .cfi_def_cfa_offset 40
; AVX-NEXT: popq %r12
@@ -747,24 +747,24 @@ define void @test_limit_one_pred(ptr noalias %s1, ptr nocapture %s2, i32 %x, ptr
; CHECK-NEXT: .cfi_offset %r15, -16
; CHECK-NEXT: movq %r8, %r12
; CHECK-NEXT: movq %rcx, %r15
-; CHECK-NEXT: movq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: movl %r9d, 12(%rdi)
; CHECK-NEXT: cmpl $18, %edx
; CHECK-NEXT: jl .LBB10_2
; CHECK-NEXT: # %bb.1: # %if.then
-; CHECK-NEXT: movl %edx, 4(%rbx)
-; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: movl %edx, 4(%r14)
+; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: callq bar@PLT
; CHECK-NEXT: .LBB10_2: # %if.end
; CHECK-NEXT: movups (%r12), %xmm0
; CHECK-NEXT: movups %xmm0, (%r15)
-; CHECK-NEXT: movq (%rbx), %rax
-; CHECK-NEXT: movq %rax, (%r14)
-; CHECK-NEXT: movl 8(%rbx), %eax
-; CHECK-NEXT: movl %eax, 8(%r14)
-; CHECK-NEXT: movl 12(%rbx), %eax
-; CHECK-NEXT: movl %eax, 12(%r14)
+; CHECK-NEXT: movq (%r14), %rax
+; CHECK-NEXT: movq %rax, (%rbx)
+; CHECK-NEXT: movl 8(%r14), %eax
+; CHECK-NEXT: movl %eax, 8(%rbx)
+; CHECK-NEXT: movl 12(%r14), %eax
+; CHECK-NEXT: movl %eax, 12(%rbx)
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %rbx
@@ -795,20 +795,20 @@ define void @test_limit_one_pred(ptr noalias %s1, ptr nocapture %s2, i32 %x, ptr
; DISABLED-NEXT: .cfi_offset %r15, -16
; DISABLED-NEXT: movq %r8, %r15
; DISABLED-NEXT: movq %rcx, %r14
-; DISABLED-NEXT: movq %rsi, %r12
-; DISABLED-NEXT: movq %rdi, %rbx
+; DISABLED-NEXT: movq %rsi, %rbx
+; DISABLED-NEXT: movq %rdi, %r12
; DISABLED-NEXT: movl %r9d, 12(%rdi)
; DISABLED-NEXT: cmpl $18, %edx
; DISABLED-NEXT: jl .LBB10_2
; DISABLED-NEXT: # %bb.1: # %if.then
-; DISABLED-NEXT: movl %edx, 4(%rbx)
-; DISABLED-NEXT: movq %rbx, %rdi
+; DISABLED-NEXT: movl %edx, 4(%r12)
+; DISABLED-NEXT: movq %r12, %rdi
; DISABLED-NEXT: callq bar@PLT
; DISABLED-NEXT: .LBB10_2: # %if.end
; DISABLED-NEXT: movups (%r15), %xmm0
; DISABLED-NEXT: movups %xmm0, (%r14)
-; DISABLED-NEXT: movups (%rbx), %xmm0
-; DISABLED-NEXT: movups %xmm0, (%r12)
+; DISABLED-NEXT: movups (%r12), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rbx)
; DISABLED-NEXT: addq $8, %rsp
; DISABLED-NEXT: .cfi_def_cfa_offset 40
; DISABLED-NEXT: popq %rbx
@@ -839,24 +839,24 @@ define void @test_limit_one_pred(ptr noalias %s1, ptr nocapture %s2, i32 %x, ptr
; AVX-NEXT: .cfi_offset %r15, -16
; AVX-NEXT: movq %r8, %r12
; AVX-NEXT: movq %rcx, %r15
-; AVX-NEXT: movq %rsi, %r14
-; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: movq %rsi, %rbx
+; AVX-NEXT: movq %rdi, %r14
; AVX-NEXT: movl %r9d, 12(%rdi)
; AVX-NEXT: cmpl $18, %edx
; AVX-NEXT: jl .LBB10_2
; AVX-NEXT: # %bb.1: # %if.then
-; AVX-NEXT: movl %edx, 4(%rbx)
-; AVX-NEXT: movq %rbx, %rdi
+; AVX-NEXT: movl %edx, 4(%r14)
+; AVX-NEXT: movq %r14, %rdi
; AVX-NEXT: callq bar@PLT
; AVX-NEXT: .LBB10_2: # %if.end
; AVX-NEXT: vmovups (%r12), %xmm0
; AVX-NEXT: vmovups %xmm0, (%r15)
-; AVX-NEXT: movq (%rbx), %rax
-; AVX-NEXT: movq %rax, (%r14)
-; AVX-NEXT: movl 8(%rbx), %eax
-; AVX-NEXT: movl %eax, 8(%r14)
-; AVX-NEXT: movl 12(%rbx), %eax
-; AVX-NEXT: movl %eax, 12(%r14)
+; AVX-NEXT: movq (%r14), %rax
+; AVX-NEXT: movq %rax, (%rbx)
+; AVX-NEXT: movl 8(%r14), %eax
+; AVX-NEXT: movl %eax, 8(%rbx)
+; AVX-NEXT: movl 12(%r14), %eax
+; AVX-NEXT: movl %eax, 12(%rbx)
; AVX-NEXT: addq $8, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 40
; AVX-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 728c15cf30150..40bb10bcd27f6 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1486,10 +1486,10 @@ define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8
;
; X64-LABEL: test_mm256_set_epi8:
; X64: # %bb.0:
-; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: vmovd %r10d, %xmm0
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
@@ -2102,10 +2102,10 @@ define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i
;
; X64-LABEL: test_mm256_setr_epi8:
; X64: # %bb.0:
-; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: vmovd %r10d, %xmm0
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll
index 4bcc2ed49cca5..33eb704788740 100644
--- a/llvm/test/CodeGen/X86/avx-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx-load-store.ll
@@ -9,9 +9,9 @@ define void @test_256_load(ptr nocapture %d, ptr nocapture %f, ptr nocapture %i)
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $96, %rsp
-; CHECK-NEXT: movq %rdx, %r14
-; CHECK-NEXT: movq %rsi, %r15
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rsi, %r14
+; CHECK-NEXT: movq %rdi, %r15
; CHECK-NEXT: vmovaps (%rdi), %ymm0
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps (%rsi), %ymm1
@@ -20,11 +20,11 @@ define void @test_256_load(ptr nocapture %d, ptr nocapture %f, ptr nocapture %i)
; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
; CHECK-NEXT: callq dummy@PLT
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vmovaps %ymm0, (%rbx)
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT: vmovaps %ymm0, (%r15)
-; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT: vmovaps %ymm0, (%r14)
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vmovaps %ymm0, (%rbx)
; CHECK-NEXT: addq $96, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index 7a5baf15fe845..1b5b12308828f 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -435,17 +435,17 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
; ALL_X64-NEXT: .cfi_offset %rbx, -32
; ALL_X64-NEXT: .cfi_offset %r14, -24
; ALL_X64-NEXT: .cfi_offset %rbp, -16
-; ALL_X64-NEXT: movl %esi, %r14d
+; ALL_X64-NEXT: movl %esi, %ebx
; ALL_X64-NEXT: movl %edi, %ebp
; ALL_X64-NEXT: movl %edx, %esi
; ALL_X64-NEXT: callq _test11
-; ALL_X64-NEXT: movzbl %al, %ebx
+; ALL_X64-NEXT: movzbl %al, %r14d
; ALL_X64-NEXT: movl %ebp, %edi
-; ALL_X64-NEXT: movl %r14d, %esi
-; ALL_X64-NEXT: movl %ebx, %edx
+; ALL_X64-NEXT: movl %ebx, %esi
+; ALL_X64-NEXT: movl %r14d, %edx
; ALL_X64-NEXT: callq _test10
; ALL_X64-NEXT: xorl %ecx, %ecx
-; ALL_X64-NEXT: testb $1, %bl
+; ALL_X64-NEXT: testb $1, %r14b
; ALL_X64-NEXT: cmovel %ecx, %eax
; ALL_X64-NEXT: popq %rbx
; ALL_X64-NEXT: popq %r14
@@ -497,17 +497,17 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
; FASTISEL-NEXT: .cfi_offset %rbx, -32
; FASTISEL-NEXT: .cfi_offset %r14, -24
; FASTISEL-NEXT: .cfi_offset %rbp, -16
-; FASTISEL-NEXT: movl %esi, %r14d
+; FASTISEL-NEXT: movl %esi, %ebx
; FASTISEL-NEXT: movl %edi, %ebp
; FASTISEL-NEXT: movl %edx, %esi
; FASTISEL-NEXT: callq _test11
-; FASTISEL-NEXT: movzbl %al, %ebx
+; FASTISEL-NEXT: movzbl %al, %r14d
; FASTISEL-NEXT: movl %ebp, %edi
-; FASTISEL-NEXT: movl %r14d, %esi
-; FASTISEL-NEXT: movl %ebx, %edx
+; FASTISEL-NEXT: movl %ebx, %esi
+; FASTISEL-NEXT: movl %r14d, %edx
; FASTISEL-NEXT: callq _test10
; FASTISEL-NEXT: xorl %ecx, %ecx
-; FASTISEL-NEXT: testb $1, %bl
+; FASTISEL-NEXT: testb $1, %r14b
; FASTISEL-NEXT: cmovel %ecx, %eax
; FASTISEL-NEXT: popq %rbx
; FASTISEL-NEXT: popq %r14
@@ -910,84 +910,84 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; KNL-NEXT: kandw %k2, %k0, %k0
; KNL-NEXT: kmovw %r10d, %k2
; KNL-NEXT: kandw %k1, %k2, %k1
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftrw $1, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftrw $2, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftrw $3, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kmovw %k1, %edi
; KNL-NEXT: kshiftrw $4, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: kshiftrw $5, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftrw $6, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kmovw %k1, %r10d
; KNL-NEXT: kshiftrw $7, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kmovw %k1, %r11d
; KNL-NEXT: kshiftrw $8, %k0, %k1
; KNL-NEXT: kmovw %k1, %ebx
; KNL-NEXT: kshiftrw $9, %k0, %k1
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kmovw %k1, %r14d
; KNL-NEXT: kshiftrw $10, %k0, %k1
; KNL-NEXT: kmovw %k1, %ebp
; KNL-NEXT: kshiftrw $11, %k0, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r15d
; KNL-NEXT: kshiftrw $12, %k0, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %r12d
; KNL-NEXT: kshiftrw $13, %k0, %k1
-; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftrw $14, %k0, %k1
-; KNL-NEXT: andl $1, %r8d
-; KNL-NEXT: movb %r8b, 2(%rax)
-; KNL-NEXT: kmovw %k0, %r8d
-; KNL-NEXT: andl $1, %r8d
-; KNL-NEXT: andl $1, %r9d
-; KNL-NEXT: leal (%r8,%r9,2), %r8d
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: andl $1, %ecx
+; KNL-NEXT: movb %cl, 2(%rax)
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: andl $1, %ecx
+; KNL-NEXT: andl $1, %edx
+; KNL-NEXT: leal (%rcx,%rdx,2), %ecx
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: andl $1, %esi
+; KNL-NEXT: leal (%rcx,%rsi,4), %ecx
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: leal (%rcx,%rdi,8), %ecx
+; KNL-NEXT: andl $1, %r9d
+; KNL-NEXT: shll $4, %r9d
+; KNL-NEXT: orl %ecx, %r9d
+; KNL-NEXT: andl $1, %r8d
+; KNL-NEXT: shll $5, %r8d
+; KNL-NEXT: orl %r9d, %r8d
; KNL-NEXT: andl $1, %r10d
-; KNL-NEXT: leal (%r8,%r10,4), %r8d
-; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: shll $6, %r10d
; KNL-NEXT: andl $1, %r11d
-; KNL-NEXT: leal (%r8,%r11,8), %r8d
-; KNL-NEXT: andl $1, %r12d
-; KNL-NEXT: shll $4, %r12d
-; KNL-NEXT: orl %r8d, %r12d
-; KNL-NEXT: andl $1, %r15d
-; KNL-NEXT: shll $5, %r15d
-; KNL-NEXT: orl %r12d, %r15d
-; KNL-NEXT: andl $1, %r14d
-; KNL-NEXT: shll $6, %r14d
-; KNL-NEXT: andl $1, %r13d
-; KNL-NEXT: shll $7, %r13d
-; KNL-NEXT: orl %r14d, %r13d
+; KNL-NEXT: shll $7, %r11d
+; KNL-NEXT: orl %r10d, %r11d
; KNL-NEXT: andl $1, %ebx
; KNL-NEXT: shll $8, %ebx
-; KNL-NEXT: orl %r13d, %ebx
-; KNL-NEXT: andl $1, %esi
-; KNL-NEXT: shll $9, %esi
-; KNL-NEXT: orl %ebx, %esi
+; KNL-NEXT: orl %r11d, %ebx
+; KNL-NEXT: andl $1, %r14d
+; KNL-NEXT: shll $9, %r14d
+; KNL-NEXT: orl %ebx, %r14d
; KNL-NEXT: andl $1, %ebp
; KNL-NEXT: shll $10, %ebp
-; KNL-NEXT: orl %esi, %ebp
-; KNL-NEXT: orl %r15d, %ebp
-; KNL-NEXT: andl $1, %ecx
-; KNL-NEXT: shll $11, %ecx
+; KNL-NEXT: orl %r14d, %ebp
+; KNL-NEXT: orl %r8d, %ebp
+; KNL-NEXT: andl $1, %r15d
+; KNL-NEXT: shll $11, %r15d
+; KNL-NEXT: andl $1, %r12d
+; KNL-NEXT: shll $12, %r12d
+; KNL-NEXT: orl %r15d, %r12d
+; KNL-NEXT: andl $1, %r13d
+; KNL-NEXT: shll $13, %r13d
+; KNL-NEXT: orl %r12d, %r13d
; KNL-NEXT: andl $1, %edx
-; KNL-NEXT: shll $12, %edx
-; KNL-NEXT: orl %ecx, %edx
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: shll $13, %edi
-; KNL-NEXT: orl %edx, %edi
-; KNL-NEXT: andl $1, %r9d
-; KNL-NEXT: shll $14, %r9d
-; KNL-NEXT: orl %edi, %r9d
-; KNL-NEXT: andl $1, %r10d
-; KNL-NEXT: shll $15, %r10d
-; KNL-NEXT: orl %r9d, %r10d
-; KNL-NEXT: orl %ebp, %r10d
-; KNL-NEXT: movw %r10w, (%rax)
+; KNL-NEXT: shll $14, %edx
+; KNL-NEXT: orl %r13d, %edx
+; KNL-NEXT: andl $1, %esi
+; KNL-NEXT: shll $15, %esi
+; KNL-NEXT: orl %edx, %esi
+; KNL-NEXT: orl %ebp, %esi
+; KNL-NEXT: movw %si, (%rax)
; KNL-NEXT: popq %rbx
; KNL-NEXT: popq %r12
; KNL-NEXT: popq %r13
@@ -1223,84 +1223,84 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
; SKX-NEXT: kandd %k1, %k0, %k0
; SKX-NEXT: kshiftrd $16, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r8d
+; SKX-NEXT: kmovd %k1, %ecx
; SKX-NEXT: kshiftrd $1, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r9d
+; SKX-NEXT: kmovd %k1, %edx
; SKX-NEXT: kshiftrd $2, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r10d
+; SKX-NEXT: kmovd %k1, %esi
; SKX-NEXT: kshiftrd $3, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r11d
+; SKX-NEXT: kmovd %k1, %edi
; SKX-NEXT: kshiftrd $4, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r12d
+; SKX-NEXT: kmovd %k1, %r9d
; SKX-NEXT: kshiftrd $5, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r15d
+; SKX-NEXT: kmovd %k1, %r8d
; SKX-NEXT: kshiftrd $6, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r14d
+; SKX-NEXT: kmovd %k1, %r10d
; SKX-NEXT: kshiftrd $7, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r13d
+; SKX-NEXT: kmovd %k1, %r11d
; SKX-NEXT: kshiftrd $8, %k0, %k1
; SKX-NEXT: kmovd %k1, %ebx
; SKX-NEXT: kshiftrd $9, %k0, %k1
-; SKX-NEXT: kmovd %k1, %esi
+; SKX-NEXT: kmovd %k1, %r14d
; SKX-NEXT: kshiftrd $10, %k0, %k1
; SKX-NEXT: kmovd %k1, %ebp
; SKX-NEXT: kshiftrd $11, %k0, %k1
-; SKX-NEXT: kmovd %k1, %ecx
+; SKX-NEXT: kmovd %k1, %r15d
; SKX-NEXT: kshiftrd $12, %k0, %k1
-; SKX-NEXT: kmovd %k1, %edx
+; SKX-NEXT: kmovd %k1, %r12d
; SKX-NEXT: kshiftrd $13, %k0, %k1
-; SKX-NEXT: kmovd %k1, %edi
+; SKX-NEXT: kmovd %k1, %r13d
; SKX-NEXT: kshiftrd $14, %k0, %k1
-; SKX-NEXT: andl $1, %r8d
-; SKX-NEXT: movb %r8b, 2(%rax)
-; SKX-NEXT: kmovd %k0, %r8d
-; SKX-NEXT: andl $1, %r8d
-; SKX-NEXT: andl $1, %r9d
-; SKX-NEXT: leal (%r8,%r9,2), %r8d
-; SKX-NEXT: kmovd %k1, %r9d
+; SKX-NEXT: andl $1, %ecx
+; SKX-NEXT: movb %cl, 2(%rax)
+; SKX-NEXT: kmovd %k0, %ecx
+; SKX-NEXT: andl $1, %ecx
+; SKX-NEXT: andl $1, %edx
+; SKX-NEXT: leal (%rcx,%rdx,2), %ecx
+; SKX-NEXT: kmovd %k1, %edx
; SKX-NEXT: kshiftrd $15, %k0, %k0
+; SKX-NEXT: andl $1, %esi
+; SKX-NEXT: leal (%rcx,%rsi,4), %ecx
+; SKX-NEXT: kmovd %k0, %esi
+; SKX-NEXT: andl $1, %edi
+; SKX-NEXT: leal (%rcx,%rdi,8), %ecx
+; SKX-NEXT: andl $1, %r9d
+; SKX-NEXT: shll $4, %r9d
+; SKX-NEXT: orl %ecx, %r9d
+; SKX-NEXT: andl $1, %r8d
+; SKX-NEXT: shll $5, %r8d
+; SKX-NEXT: orl %r9d, %r8d
; SKX-NEXT: andl $1, %r10d
-; SKX-NEXT: leal (%r8,%r10,4), %r8d
-; SKX-NEXT: kmovd %k0, %r10d
+; SKX-NEXT: shll $6, %r10d
; SKX-NEXT: andl $1, %r11d
-; SKX-NEXT: leal (%r8,%r11,8), %r8d
-; SKX-NEXT: andl $1, %r12d
-; SKX-NEXT: shll $4, %r12d
-; SKX-NEXT: orl %r8d, %r12d
-; SKX-NEXT: andl $1, %r15d
-; SKX-NEXT: shll $5, %r15d
-; SKX-NEXT: orl %r12d, %r15d
-; SKX-NEXT: andl $1, %r14d
-; SKX-NEXT: shll $6, %r14d
-; SKX-NEXT: andl $1, %r13d
-; SKX-NEXT: shll $7, %r13d
-; SKX-NEXT: orl %r14d, %r13d
+; SKX-NEXT: shll $7, %r11d
+; SKX-NEXT: orl %r10d, %r11d
; SKX-NEXT: andl $1, %ebx
; SKX-NEXT: shll $8, %ebx
-; SKX-NEXT: orl %r13d, %ebx
-; SKX-NEXT: andl $1, %esi
-; SKX-NEXT: shll $9, %esi
-; SKX-NEXT: orl %ebx, %esi
+; SKX-NEXT: orl %r11d, %ebx
+; SKX-NEXT: andl $1, %r14d
+; SKX-NEXT: shll $9, %r14d
+; SKX-NEXT: orl %ebx, %r14d
; SKX-NEXT: andl $1, %ebp
; SKX-NEXT: shll $10, %ebp
-; SKX-NEXT: orl %esi, %ebp
-; SKX-NEXT: orl %r15d, %ebp
-; SKX-NEXT: andl $1, %ecx
-; SKX-NEXT: shll $11, %ecx
+; SKX-NEXT: orl %r14d, %ebp
+; SKX-NEXT: orl %r8d, %ebp
+; SKX-NEXT: andl $1, %r15d
+; SKX-NEXT: shll $11, %r15d
+; SKX-NEXT: andl $1, %r12d
+; SKX-NEXT: shll $12, %r12d
+; SKX-NEXT: orl %r15d, %r12d
+; SKX-NEXT: andl $1, %r13d
+; SKX-NEXT: shll $13, %r13d
+; SKX-NEXT: orl %r12d, %r13d
; SKX-NEXT: andl $1, %edx
-; SKX-NEXT: shll $12, %edx
-; SKX-NEXT: orl %ecx, %edx
-; SKX-NEXT: andl $1, %edi
-; SKX-NEXT: shll $13, %edi
-; SKX-NEXT: orl %edx, %edi
-; SKX-NEXT: andl $1, %r9d
-; SKX-NEXT: shll $14, %r9d
-; SKX-NEXT: orl %edi, %r9d
-; SKX-NEXT: andl $1, %r10d
-; SKX-NEXT: shll $15, %r10d
-; SKX-NEXT: orl %r9d, %r10d
-; SKX-NEXT: orl %ebp, %r10d
-; SKX-NEXT: movw %r10w, (%rax)
+; SKX-NEXT: shll $14, %edx
+; SKX-NEXT: orl %r13d, %edx
+; SKX-NEXT: andl $1, %esi
+; SKX-NEXT: shll $15, %esi
+; SKX-NEXT: orl %edx, %esi
+; SKX-NEXT: orl %ebp, %esi
+; SKX-NEXT: movw %si, (%rax)
; SKX-NEXT: popq %rbx
; SKX-NEXT: popq %r12
; SKX-NEXT: popq %r13
@@ -1864,84 +1864,84 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
; FASTISEL-NEXT: kandd %k1, %k0, %k0
; FASTISEL-NEXT: kshiftrd $16, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r8d
+; FASTISEL-NEXT: kmovd %k1, %ecx
; FASTISEL-NEXT: kshiftrd $1, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r9d
+; FASTISEL-NEXT: kmovd %k1, %edx
; FASTISEL-NEXT: kshiftrd $2, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r10d
+; FASTISEL-NEXT: kmovd %k1, %esi
; FASTISEL-NEXT: kshiftrd $3, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r11d
+; FASTISEL-NEXT: kmovd %k1, %edi
; FASTISEL-NEXT: kshiftrd $4, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r12d
+; FASTISEL-NEXT: kmovd %k1, %r9d
; FASTISEL-NEXT: kshiftrd $5, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r15d
+; FASTISEL-NEXT: kmovd %k1, %r8d
; FASTISEL-NEXT: kshiftrd $6, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r14d
+; FASTISEL-NEXT: kmovd %k1, %r10d
; FASTISEL-NEXT: kshiftrd $7, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r13d
+; FASTISEL-NEXT: kmovd %k1, %r11d
; FASTISEL-NEXT: kshiftrd $8, %k0, %k1
; FASTISEL-NEXT: kmovd %k1, %ebx
; FASTISEL-NEXT: kshiftrd $9, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %esi
+; FASTISEL-NEXT: kmovd %k1, %r14d
; FASTISEL-NEXT: kshiftrd $10, %k0, %k1
; FASTISEL-NEXT: kmovd %k1, %ebp
; FASTISEL-NEXT: kshiftrd $11, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %ecx
+; FASTISEL-NEXT: kmovd %k1, %r15d
; FASTISEL-NEXT: kshiftrd $12, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %edx
+; FASTISEL-NEXT: kmovd %k1, %r12d
; FASTISEL-NEXT: kshiftrd $13, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %edi
+; FASTISEL-NEXT: kmovd %k1, %r13d
; FASTISEL-NEXT: kshiftrd $14, %k0, %k1
-; FASTISEL-NEXT: andl $1, %r8d
-; FASTISEL-NEXT: movb %r8b, 2(%rax)
-; FASTISEL-NEXT: kmovd %k0, %r8d
-; FASTISEL-NEXT: andl $1, %r8d
-; FASTISEL-NEXT: andl $1, %r9d
-; FASTISEL-NEXT: leal (%r8,%r9,2), %r8d
-; FASTISEL-NEXT: kmovd %k1, %r9d
+; FASTISEL-NEXT: andl $1, %ecx
+; FASTISEL-NEXT: movb %cl, 2(%rax)
+; FASTISEL-NEXT: kmovd %k0, %ecx
+; FASTISEL-NEXT: andl $1, %ecx
+; FASTISEL-NEXT: andl $1, %edx
+; FASTISEL-NEXT: leal (%rcx,%rdx,2), %ecx
+; FASTISEL-NEXT: kmovd %k1, %edx
; FASTISEL-NEXT: kshiftrd $15, %k0, %k0
+; FASTISEL-NEXT: andl $1, %esi
+; FASTISEL-NEXT: leal (%rcx,%rsi,4), %ecx
+; FASTISEL-NEXT: kmovd %k0, %esi
+; FASTISEL-NEXT: andl $1, %edi
+; FASTISEL-NEXT: leal (%rcx,%rdi,8), %ecx
+; FASTISEL-NEXT: andl $1, %r9d
+; FASTISEL-NEXT: shll $4, %r9d
+; FASTISEL-NEXT: orl %ecx, %r9d
+; FASTISEL-NEXT: andl $1, %r8d
+; FASTISEL-NEXT: shll $5, %r8d
+; FASTISEL-NEXT: orl %r9d, %r8d
; FASTISEL-NEXT: andl $1, %r10d
-; FASTISEL-NEXT: leal (%r8,%r10,4), %r8d
-; FASTISEL-NEXT: kmovd %k0, %r10d
+; FASTISEL-NEXT: shll $6, %r10d
; FASTISEL-NEXT: andl $1, %r11d
-; FASTISEL-NEXT: leal (%r8,%r11,8), %r8d
-; FASTISEL-NEXT: andl $1, %r12d
-; FASTISEL-NEXT: shll $4, %r12d
-; FASTISEL-NEXT: orl %r8d, %r12d
-; FASTISEL-NEXT: andl $1, %r15d
-; FASTISEL-NEXT: shll $5, %r15d
-; FASTISEL-NEXT: orl %r12d, %r15d
-; FASTISEL-NEXT: andl $1, %r14d
-; FASTISEL-NEXT: shll $6, %r14d
-; FASTISEL-NEXT: andl $1, %r13d
-; FASTISEL-NEXT: shll $7, %r13d
-; FASTISEL-NEXT: orl %r14d, %r13d
+; FASTISEL-NEXT: shll $7, %r11d
+; FASTISEL-NEXT: orl %r10d, %r11d
; FASTISEL-NEXT: andl $1, %ebx
; FASTISEL-NEXT: shll $8, %ebx
-; FASTISEL-NEXT: orl %r13d, %ebx
-; FASTISEL-NEXT: andl $1, %esi
-; FASTISEL-NEXT: shll $9, %esi
-; FASTISEL-NEXT: orl %ebx, %esi
+; FASTISEL-NEXT: orl %r11d, %ebx
+; FASTISEL-NEXT: andl $1, %r14d
+; FASTISEL-NEXT: shll $9, %r14d
+; FASTISEL-NEXT: orl %ebx, %r14d
; FASTISEL-NEXT: andl $1, %ebp
; FASTISEL-NEXT: shll $10, %ebp
-; FASTISEL-NEXT: orl %esi, %ebp
-; FASTISEL-NEXT: orl %r15d, %ebp
-; FASTISEL-NEXT: andl $1, %ecx
-; FASTISEL-NEXT: shll $11, %ecx
+; FASTISEL-NEXT: orl %r14d, %ebp
+; FASTISEL-NEXT: orl %r8d, %ebp
+; FASTISEL-NEXT: andl $1, %r15d
+; FASTISEL-NEXT: shll $11, %r15d
+; FASTISEL-NEXT: andl $1, %r12d
+; FASTISEL-NEXT: shll $12, %r12d
+; FASTISEL-NEXT: orl %r15d, %r12d
+; FASTISEL-NEXT: andl $1, %r13d
+; FASTISEL-NEXT: shll $13, %r13d
+; FASTISEL-NEXT: orl %r12d, %r13d
; FASTISEL-NEXT: andl $1, %edx
-; FASTISEL-NEXT: shll $12, %edx
-; FASTISEL-NEXT: orl %ecx, %edx
-; FASTISEL-NEXT: andl $1, %edi
-; FASTISEL-NEXT: shll $13, %edi
-; FASTISEL-NEXT: orl %edx, %edi
-; FASTISEL-NEXT: andl $1, %r9d
-; FASTISEL-NEXT: shll $14, %r9d
-; FASTISEL-NEXT: orl %edi, %r9d
-; FASTISEL-NEXT: andl $1, %r10d
-; FASTISEL-NEXT: shll $15, %r10d
-; FASTISEL-NEXT: orl %r9d, %r10d
-; FASTISEL-NEXT: orl %ebp, %r10d
-; FASTISEL-NEXT: movw %r10w, (%rax)
+; FASTISEL-NEXT: shll $14, %edx
+; FASTISEL-NEXT: orl %r13d, %edx
+; FASTISEL-NEXT: andl $1, %esi
+; FASTISEL-NEXT: shll $15, %esi
+; FASTISEL-NEXT: orl %edx, %esi
+; FASTISEL-NEXT: orl %ebp, %esi
+; FASTISEL-NEXT: movw %si, (%rax)
; FASTISEL-NEXT: popq %rbx
; FASTISEL-NEXT: popq %r12
; FASTISEL-NEXT: popq %r13
@@ -2045,13 +2045,13 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; KNL-NEXT: kshiftrw $9, %k6, %k6
; KNL-NEXT: korw %k6, %k0, %k0
; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
-; KNL-NEXT: andl $1, %r10d
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; KNL-NEXT: kmovw %r10d, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k0
; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %r10d, %k6
+; KNL-NEXT: kmovw %edi, %k6
; KNL-NEXT: korw %k0, %k6, %k0
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
@@ -2084,13 +2084,13 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; KNL-NEXT: kshiftrw $9, %k6, %k6
; KNL-NEXT: korw %k6, %k0, %k0
; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
-; KNL-NEXT: andl $1, %r10d
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; KNL-NEXT: kmovw %r10d, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k0
; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %r10d, %k6
+; KNL-NEXT: kmovw %edi, %k6
; KNL-NEXT: korw %k0, %k6, %k0
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
@@ -2123,13 +2123,13 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; KNL-NEXT: kshiftrw $9, %k6, %k6
; KNL-NEXT: korw %k6, %k0, %k0
; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
-; KNL-NEXT: andl $1, %r10d
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; KNL-NEXT: kmovw %r10d, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k0
; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %r10d, %k6
+; KNL-NEXT: kmovw %edi, %k6
; KNL-NEXT: korw %k0, %k6, %k0
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
@@ -2162,13 +2162,13 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; KNL-NEXT: kshiftrw $9, %k6, %k6
; KNL-NEXT: korw %k6, %k0, %k0
; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
-; KNL-NEXT: andl $1, %r10d
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; KNL-NEXT: kmovw %r10d, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k0
; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %r10d, %k6
+; KNL-NEXT: kmovw %edi, %k6
; KNL-NEXT: korw %k0, %k6, %k0
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
@@ -2201,13 +2201,13 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; KNL-NEXT: kshiftrw $9, %k6, %k6
; KNL-NEXT: korw %k6, %k0, %k0
; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
-; KNL-NEXT: andl $1, %r10d
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; KNL-NEXT: kmovw %r10d, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k0
; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %r10d, %k6
+; KNL-NEXT: kmovw %edi, %k6
; KNL-NEXT: korw %k0, %k6, %k0
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
@@ -2327,38 +2327,38 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kshiftrw $6, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftrw $5, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftrw $4, %k0, %k1
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftrw $3, %k0, %k1
; KNL-NEXT: kmovw %k1, %edi
; KNL-NEXT: kshiftrw $2, %k0, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftrw $1, %k0, %k1
-; KNL-NEXT: kmovw %k1, %edx
-; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: andb $1, %sil
-; KNL-NEXT: andb $1, %dl
-; KNL-NEXT: addb %dl, %dl
-; KNL-NEXT: orb %sil, %dl
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: shlb $2, %cl
-; KNL-NEXT: orb %dl, %cl
-; KNL-NEXT: andb $1, %dil
-; KNL-NEXT: shlb $3, %dil
-; KNL-NEXT: orb %cl, %dil
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: andb $1, %r10b
-; KNL-NEXT: shlb $4, %r10b
-; KNL-NEXT: orb %dil, %r10b
; KNL-NEXT: andb $1, %r9b
-; KNL-NEXT: shlb $5, %r9b
+; KNL-NEXT: addb %r9b, %r9b
; KNL-NEXT: orb %r10b, %r9b
-; KNL-NEXT: shlb $6, %r8b
+; KNL-NEXT: andb $1, %r8b
+; KNL-NEXT: shlb $2, %r8b
; KNL-NEXT: orb %r9b, %r8b
-; KNL-NEXT: andb $127, %r8b
-; KNL-NEXT: movb %r8b, (%rax)
+; KNL-NEXT: andb $1, %dil
+; KNL-NEXT: shlb $3, %dil
+; KNL-NEXT: orb %r8b, %dil
+; KNL-NEXT: andb $1, %sil
+; KNL-NEXT: shlb $4, %sil
+; KNL-NEXT: orb %dil, %sil
+; KNL-NEXT: andb $1, %dl
+; KNL-NEXT: shlb $5, %dl
+; KNL-NEXT: orb %sil, %dl
+; KNL-NEXT: shlb $6, %cl
+; KNL-NEXT: orb %dl, %cl
+; KNL-NEXT: andb $127, %cl
+; KNL-NEXT: movb %cl, (%rax)
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
@@ -2705,38 +2705,38 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
; SKX-NEXT: kandb %k1, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r8d
+; SKX-NEXT: kmovd %k1, %ecx
; SKX-NEXT: kshiftrb $5, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r9d
+; SKX-NEXT: kmovd %k1, %edx
; SKX-NEXT: kshiftrb $4, %k0, %k1
-; SKX-NEXT: kmovd %k1, %r10d
+; SKX-NEXT: kmovd %k1, %esi
; SKX-NEXT: kshiftrb $3, %k0, %k1
; SKX-NEXT: kmovd %k1, %edi
; SKX-NEXT: kshiftrb $2, %k0, %k1
-; SKX-NEXT: kmovd %k1, %ecx
+; SKX-NEXT: kmovd %k1, %r8d
; SKX-NEXT: kshiftrb $1, %k0, %k1
-; SKX-NEXT: kmovd %k1, %edx
-; SKX-NEXT: kmovd %k0, %esi
-; SKX-NEXT: andb $1, %sil
-; SKX-NEXT: andb $1, %dl
-; SKX-NEXT: addb %dl, %dl
-; SKX-NEXT: orb %sil, %dl
-; SKX-NEXT: andb $1, %cl
-; SKX-NEXT: shlb $2, %cl
-; SKX-NEXT: orb %dl, %cl
-; SKX-NEXT: andb $1, %dil
-; SKX-NEXT: shlb $3, %dil
-; SKX-NEXT: orb %cl, %dil
+; SKX-NEXT: kmovd %k1, %r9d
+; SKX-NEXT: kmovd %k0, %r10d
; SKX-NEXT: andb $1, %r10b
-; SKX-NEXT: shlb $4, %r10b
-; SKX-NEXT: orb %dil, %r10b
; SKX-NEXT: andb $1, %r9b
-; SKX-NEXT: shlb $5, %r9b
+; SKX-NEXT: addb %r9b, %r9b
; SKX-NEXT: orb %r10b, %r9b
-; SKX-NEXT: shlb $6, %r8b
+; SKX-NEXT: andb $1, %r8b
+; SKX-NEXT: shlb $2, %r8b
; SKX-NEXT: orb %r9b, %r8b
-; SKX-NEXT: andb $127, %r8b
-; SKX-NEXT: movb %r8b, (%rax)
+; SKX-NEXT: andb $1, %dil
+; SKX-NEXT: shlb $3, %dil
+; SKX-NEXT: orb %r8b, %dil
+; SKX-NEXT: andb $1, %sil
+; SKX-NEXT: shlb $4, %sil
+; SKX-NEXT: orb %dil, %sil
+; SKX-NEXT: andb $1, %dl
+; SKX-NEXT: shlb $5, %dl
+; SKX-NEXT: orb %sil, %dl
+; SKX-NEXT: shlb $6, %cl
+; SKX-NEXT: orb %dl, %cl
+; SKX-NEXT: andb $127, %cl
+; SKX-NEXT: movb %cl, (%rax)
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test17:
@@ -3494,38 +3494,38 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
; FASTISEL-NEXT: kandb %k1, %k0, %k0
; FASTISEL-NEXT: kshiftrb $6, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r8d
+; FASTISEL-NEXT: kmovd %k1, %ecx
; FASTISEL-NEXT: kshiftrb $5, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r9d
+; FASTISEL-NEXT: kmovd %k1, %edx
; FASTISEL-NEXT: kshiftrb $4, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %r10d
+; FASTISEL-NEXT: kmovd %k1, %esi
; FASTISEL-NEXT: kshiftrb $3, %k0, %k1
; FASTISEL-NEXT: kmovd %k1, %edi
; FASTISEL-NEXT: kshiftrb $2, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %ecx
+; FASTISEL-NEXT: kmovd %k1, %r8d
; FASTISEL-NEXT: kshiftrb $1, %k0, %k1
-; FASTISEL-NEXT: kmovd %k1, %edx
-; FASTISEL-NEXT: kmovd %k0, %esi
-; FASTISEL-NEXT: andb $1, %sil
-; FASTISEL-NEXT: andb $1, %dl
-; FASTISEL-NEXT: addb %dl, %dl
-; FASTISEL-NEXT: orb %sil, %dl
-; FASTISEL-NEXT: andb $1, %cl
-; FASTISEL-NEXT: shlb $2, %cl
-; FASTISEL-NEXT: orb %dl, %cl
-; FASTISEL-NEXT: andb $1, %dil
-; FASTISEL-NEXT: shlb $3, %dil
-; FASTISEL-NEXT: orb %cl, %dil
+; FASTISEL-NEXT: kmovd %k1, %r9d
+; FASTISEL-NEXT: kmovd %k0, %r10d
; FASTISEL-NEXT: andb $1, %r10b
-; FASTISEL-NEXT: shlb $4, %r10b
-; FASTISEL-NEXT: orb %dil, %r10b
; FASTISEL-NEXT: andb $1, %r9b
-; FASTISEL-NEXT: shlb $5, %r9b
+; FASTISEL-NEXT: addb %r9b, %r9b
; FASTISEL-NEXT: orb %r10b, %r9b
-; FASTISEL-NEXT: shlb $6, %r8b
+; FASTISEL-NEXT: andb $1, %r8b
+; FASTISEL-NEXT: shlb $2, %r8b
; FASTISEL-NEXT: orb %r9b, %r8b
-; FASTISEL-NEXT: andb $127, %r8b
-; FASTISEL-NEXT: movb %r8b, (%rax)
+; FASTISEL-NEXT: andb $1, %dil
+; FASTISEL-NEXT: shlb $3, %dil
+; FASTISEL-NEXT: orb %r8b, %dil
+; FASTISEL-NEXT: andb $1, %sil
+; FASTISEL-NEXT: shlb $4, %sil
+; FASTISEL-NEXT: orb %dil, %sil
+; FASTISEL-NEXT: andb $1, %dl
+; FASTISEL-NEXT: shlb $5, %dl
+; FASTISEL-NEXT: orb %sil, %dl
+; FASTISEL-NEXT: shlb $6, %cl
+; FASTISEL-NEXT: orb %dl, %cl
+; FASTISEL-NEXT: andb $127, %cl
+; FASTISEL-NEXT: movb %cl, (%rax)
; FASTISEL-NEXT: retq
%j = and <7 x i1> %a, %b
%k = and <7 x i1> %j, %c
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index f57449578f70e..0caa8826e75c8 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -986,11 +986,8 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
;
; WIN64-LABEL: testi32_inp:
; WIN64: # %bb.0:
-; WIN64-NEXT: pushq %r13
-; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
-; WIN64-NEXT: movl %ecx, %ebx
; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15
; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
@@ -1000,45 +997,40 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
-; WIN64-NEXT: leal (%rdx,%rdi), %r13d
+; WIN64-NEXT: leal (%rdx,%rdi), %ebx
; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx
; WIN64-NEXT: subl %edi, %edx
-; WIN64-NEXT: leal (%rsi,%r8), %ecx
+; WIN64-NEXT: leal (%rsi,%r8), %edi
; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi
; WIN64-NEXT: subl %r8d, %esi
; WIN64-NEXT: leal (%r9,%r10), %r8d
-; WIN64-NEXT: movl %r9d, %ebp
-; WIN64-NEXT: subl %r10d, %ebp
-; WIN64-NEXT: movl %eax, %edi
-; WIN64-NEXT: movl %ebx, %r9d
-; WIN64-NEXT: subl %ebx, %edi
-; WIN64-NEXT: imull %edi, %ebp
-; WIN64-NEXT: leal (%r11,%r12), %edi
-; WIN64-NEXT: movl %r11d, %ebx
-; WIN64-NEXT: subl %r12d, %ebx
-; WIN64-NEXT: imull %edx, %ebx
-; WIN64-NEXT: addl %ebp, %ebx
+; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9
+; WIN64-NEXT: subl %r10d, %r9d
+; WIN64-NEXT: movl %eax, %r10d
+; WIN64-NEXT: subl %ecx, %r10d
+; WIN64-NEXT: imull %r10d, %r9d
+; WIN64-NEXT: leal (%r11,%r12), %r10d
+; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11
+; WIN64-NEXT: subl %r12d, %r11d
+; WIN64-NEXT: imull %edx, %r11d
+; WIN64-NEXT: addl %r9d, %r11d
; WIN64-NEXT: leal (%r14,%r15), %edx
-; WIN64-NEXT: movl %r14d, %ebp
-; WIN64-NEXT: subl %r15d, %ebp
-; WIN64-NEXT: imull %esi, %ebp
-; WIN64-NEXT: addl %ebx, %ebp
-; WIN64-NEXT: addl %r9d, %eax
+; WIN64-NEXT: movl %r14d, %r9d
+; WIN64-NEXT: subl %r15d, %r9d
+; WIN64-NEXT: imull %esi, %r9d
+; WIN64-NEXT: addl %r11d, %r9d
+; WIN64-NEXT: addl %ecx, %eax
; WIN64-NEXT: imull %r8d, %eax
-; WIN64-NEXT: imull %r13d, %edi
-; WIN64-NEXT: addl %edi, %eax
-; WIN64-NEXT: imull %ecx, %edx
+; WIN64-NEXT: imull %ebx, %r10d
+; WIN64-NEXT: addl %r10d, %eax
+; WIN64-NEXT: imull %edi, %edx
; WIN64-NEXT: addl %edx, %eax
-; WIN64-NEXT: addl %ebp, %eax
+; WIN64-NEXT: addl %r9d, %eax
; WIN64-NEXT: popq %rbx
-; WIN64-NEXT: popq %rbp
-; WIN64-NEXT: popq %r13
; WIN64-NEXT: retq
;
; LINUXOSX64-LABEL: testi32_inp:
; LINUXOSX64: # %bb.0:
-; LINUXOSX64-NEXT: pushq %rbp
-; LINUXOSX64-NEXT: pushq %rbx
; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx
; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX64-NEXT: # kill: def $r14d killed $r14d def $r14
@@ -1048,37 +1040,35 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8
; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi
; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d
-; LINUXOSX64-NEXT: movl %edx, %ebp
-; LINUXOSX64-NEXT: subl %edi, %ebp
-; LINUXOSX64-NEXT: leal (%rsi,%r8), %r11d
+; LINUXOSX64-NEXT: # kill: def $edx killed $edx killed $rdx
+; LINUXOSX64-NEXT: subl %edi, %edx
+; LINUXOSX64-NEXT: leal (%rsi,%r8), %edi
; LINUXOSX64-NEXT: # kill: def $esi killed $esi killed $rsi
; LINUXOSX64-NEXT: subl %r8d, %esi
; LINUXOSX64-NEXT: leal (%r9,%r12), %r8d
-; LINUXOSX64-NEXT: movl %r9d, %edi
-; LINUXOSX64-NEXT: subl %r12d, %edi
-; LINUXOSX64-NEXT: movl %eax, %edx
-; LINUXOSX64-NEXT: subl %ecx, %edx
-; LINUXOSX64-NEXT: imull %edx, %edi
-; LINUXOSX64-NEXT: leal (%r13,%r14), %edx
-; LINUXOSX64-NEXT: movl %r13d, %ebx
-; LINUXOSX64-NEXT: subl %r14d, %ebx
-; LINUXOSX64-NEXT: imull %ebp, %ebx
-; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %ebp
-; LINUXOSX64-NEXT: addl %edi, %ebx
-; LINUXOSX64-NEXT: movl %r15d, %edi
-; LINUXOSX64-NEXT: subl %ebp, %edi
-; LINUXOSX64-NEXT: imull %esi, %edi
-; LINUXOSX64-NEXT: addl %ebx, %edi
+; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d killed $r9
+; LINUXOSX64-NEXT: subl %r12d, %r9d
+; LINUXOSX64-NEXT: movl %eax, %r11d
+; LINUXOSX64-NEXT: subl %ecx, %r11d
+; LINUXOSX64-NEXT: imull %r11d, %r9d
+; LINUXOSX64-NEXT: leal (%r13,%r14), %r11d
+; LINUXOSX64-NEXT: movl %r13d, %r12d
+; LINUXOSX64-NEXT: subl %r14d, %r12d
+; LINUXOSX64-NEXT: imull %edx, %r12d
+; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; LINUXOSX64-NEXT: addl %r9d, %r12d
+; LINUXOSX64-NEXT: movl %r15d, %r9d
+; LINUXOSX64-NEXT: subl %edx, %r9d
+; LINUXOSX64-NEXT: imull %esi, %r9d
+; LINUXOSX64-NEXT: addl %r12d, %r9d
; LINUXOSX64-NEXT: addl %ecx, %eax
; LINUXOSX64-NEXT: imull %r8d, %eax
-; LINUXOSX64-NEXT: imull %r10d, %edx
+; LINUXOSX64-NEXT: imull %r10d, %r11d
+; LINUXOSX64-NEXT: addl %r11d, %eax
+; LINUXOSX64-NEXT: addl %r15d, %edx
+; LINUXOSX64-NEXT: imull %edi, %edx
; LINUXOSX64-NEXT: addl %edx, %eax
-; LINUXOSX64-NEXT: addl %r15d, %ebp
-; LINUXOSX64-NEXT: imull %r11d, %ebp
-; LINUXOSX64-NEXT: addl %ebp, %eax
-; LINUXOSX64-NEXT: addl %edi, %eax
-; LINUXOSX64-NEXT: popq %rbx
-; LINUXOSX64-NEXT: popq %rbp
+; LINUXOSX64-NEXT: addl %r9d, %eax
; LINUXOSX64-NEXT: retq
%x1 = sub i32 %a1, %a2
%x2 = sub i32 %a3, %a4
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 4f69c9a676a08..ae710cc40a522 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -4846,7 +4846,7 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; X64-LABEL: test_cmp_b_256:
; X64: # %bb.0:
; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
-; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0]
+; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
@@ -4856,15 +4856,15 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; X64-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05]
; X64-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
-; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0]
; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
; X64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
-; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02]
+; X64-NEXT: vpinsrd $2, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x02]
; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
; X64-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
; X64-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
-; X64-NEXT: vmovd %r8d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0]
+; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
; X64-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
@@ -4946,26 +4946,26 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
-; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0]
+; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
-; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8]
+; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
; X64-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
; X64-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x05]
-; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0]
; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
-; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8]
; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
-; X64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; X64-NEXT: vpinsrd $1, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x01]
+; X64-NEXT: vpinsrd $2, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc1,0x02]
; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
-; X64-NEXT: vmovd %r8d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0]
+; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
; X64-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: vmovd %r9d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd1]
+; X64-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
; X64-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
; X64-NEXT: # xmm1 = xmm1[0],xmm2[0]
; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
@@ -5040,7 +5040,7 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; X64-LABEL: test_ucmp_b_256:
; X64: # %bb.0:
; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
-; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0]
+; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
@@ -5050,15 +5050,15 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; X64-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05]
; X64-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
; X64-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06]
-; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0]
; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
; X64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
-; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02]
+; X64-NEXT: vpinsrd $2, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x02]
; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
; X64-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
; X64-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
-; X64-NEXT: vmovd %r8d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0]
+; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
; X64-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
@@ -5140,26 +5140,26 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask)
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
-; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0]
+; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
-; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8]
+; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
; X64-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
; X64-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
-; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0]
; X64-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
-; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8]
; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
-; X64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; X64-NEXT: vpinsrd $1, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x01]
+; X64-NEXT: vpinsrd $2, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc1,0x02]
; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
-; X64-NEXT: vmovd %r8d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0]
+; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
; X64-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: vmovd %r9d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd1]
+; X64-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
; X64-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
; X64-NEXT: # xmm1 = xmm1[0],xmm2[0]
; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 391833d6bd84e..20b095f3b74f8 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -49,8 +49,8 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: movq %rdx, %r14
-; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rsi, %r14
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq __truncdfbf2 at PLT
; CHECK-NEXT: movd %xmm0, %ebp
@@ -67,7 +67,7 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
-; CHECK-NEXT: movsd %xmm0, (%r14)
+; CHECK-NEXT: movsd %xmm0, (%rbx)
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %rbp
@@ -207,63 +207,63 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; CHECK-NEXT: shrq $48, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT: movq %xmm0, %rbx
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: movq %xmm0, %r12
+; CHECK-NEXT: movq %r12, %rax
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: movq %rax, (%rsp) # 8-byte Spill
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; CHECK-NEXT: movq %xmm0, %rbp
-; CHECK-NEXT: movq %rbp, %r15
-; CHECK-NEXT: shrq $32, %r15
-; CHECK-NEXT: movq %rbx, %r13
+; CHECK-NEXT: movq %xmm0, %r14
+; CHECK-NEXT: movq %r14, %rbp
+; CHECK-NEXT: shrq $32, %rbp
+; CHECK-NEXT: movq %r12, %r15
+; CHECK-NEXT: shrq $48, %r15
+; CHECK-NEXT: movq %r14, %r13
; CHECK-NEXT: shrq $48, %r13
-; CHECK-NEXT: movq %rbp, %r12
-; CHECK-NEXT: shrq $48, %r12
-; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: movl %r14d, %eax
; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: movl %r12d, %eax
; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2 at PLT
-; CHECK-NEXT: movd %xmm0, %r14d
-; CHECK-NEXT: shll $16, %r14d
-; CHECK-NEXT: shll $16, %ebp
-; CHECK-NEXT: movd %ebp, %xmm1
+; CHECK-NEXT: movd %xmm0, %ebx
; CHECK-NEXT: shll $16, %ebx
-; CHECK-NEXT: movd %ebx, %xmm0
+; CHECK-NEXT: shll $16, %r14d
+; CHECK-NEXT: movd %r14d, %xmm1
+; CHECK-NEXT: shll $16, %r12d
+; CHECK-NEXT: movd %r12d, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2 at PLT
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movzwl %ax, %ebx
-; CHECK-NEXT: orl %r14d, %ebx
-; CHECK-NEXT: shll $16, %r12d
-; CHECK-NEXT: movd %r12d, %xmm1
+; CHECK-NEXT: movzwl %ax, %r12d
+; CHECK-NEXT: orl %ebx, %r12d
; CHECK-NEXT: shll $16, %r13d
-; CHECK-NEXT: movd %r13d, %xmm0
+; CHECK-NEXT: movd %r13d, %xmm1
+; CHECK-NEXT: shll $16, %r15d
+; CHECK-NEXT: movd %r15d, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2 at PLT
-; CHECK-NEXT: movd %xmm0, %ebp
+; CHECK-NEXT: movd %xmm0, %r14d
+; CHECK-NEXT: shll $16, %r14d
; CHECK-NEXT: shll $16, %ebp
-; CHECK-NEXT: shll $16, %r15d
-; CHECK-NEXT: movd %r15d, %xmm1
+; CHECK-NEXT: movd %ebp, %xmm1
; CHECK-NEXT: movq (%rsp), %rax # 8-byte Reload
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2 at PLT
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movzwl %ax, %r14d
-; CHECK-NEXT: orl %ebp, %r14d
-; CHECK-NEXT: shlq $32, %r14
-; CHECK-NEXT: orq %rbx, %r14
+; CHECK-NEXT: movzwl %ax, %ebx
+; CHECK-NEXT: orl %r14d, %ebx
+; CHECK-NEXT: shlq $32, %rbx
+; CHECK-NEXT: orq %r12, %rbx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; CHECK-NEXT: movl %r15d, %eax
; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; CHECK-NEXT: movl %r14d, %eax
; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
@@ -273,14 +273,14 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; CHECK-NEXT: movq %r15, %rax
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: movq %r14, %rax
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2 at PLT
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movzwl %ax, %ebx
-; CHECK-NEXT: orl %ebp, %ebx
+; CHECK-NEXT: movzwl %ax, %r14d
+; CHECK-NEXT: orl %ebp, %r14d
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm1
@@ -303,9 +303,9 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: orl %ebp, %eax
; CHECK-NEXT: shlq $32, %rax
-; CHECK-NEXT: orq %rbx, %rax
+; CHECK-NEXT: orq %r14, %rax
; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: movq %r14, %xmm1
+; CHECK-NEXT: movq %rbx, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
index 7deacda597519..fde72b4cc08b3 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -38,13 +38,13 @@ define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9
; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm8
+; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4
+; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -222,18 +222,18 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm8
+; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
-; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpgtw %xmm7, %xmm5, %xmm2
-; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm7, %xmm5, %xmm3
+; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
@@ -330,13 +330,13 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm8
+; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
@@ -508,28 +508,28 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
; AVX1-NEXT: vpcmpgtb %xmm8, %xmm9, %xmm8
-; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
-; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm9, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7
; AVX1-NEXT: vpcmpgtb %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: vpmovmskb %xmm1, %ecx
+; AVX1-NEXT: vpmovmskb %xmm3, %ecx
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpmovmskb %xmm3, %edx
+; AVX1-NEXT: vpmovmskb %xmm1, %edx
; AVX1-NEXT: vpmovmskb %xmm2, %eax
; AVX1-NEXT: shll $16, %eax
; AVX1-NEXT: orl %edx, %eax
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 3c0f20ab7aa07..2203d82907930 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -1112,7 +1112,6 @@ define i528 @large_promotion(i528 %A) nounwind {
;
; X64-LABEL: large_promotion:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %r13
@@ -1121,189 +1120,188 @@ define i528 @large_promotion(i528 %A) nounwind {
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: bswapq %rdi
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: shrq $4, %rbx
-; X64-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %r13, %rbx
-; X64-NEXT: andq %r13, %rdi
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: shrq $4, %r10
+; X64-NEXT: movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %r11, %r10
+; X64-NEXT: andq %r11, %rdi
; X64-NEXT: shlq $4, %rdi
-; X64-NEXT: orq %rbx, %rdi
-; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: andq %r11, %rbx
+; X64-NEXT: orq %r10, %rdi
+; X64-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: andq %r10, %r14
; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %r11, %rdi
-; X64-NEXT: leaq (%rdi,%rbx,4), %rdi
-; X64-NEXT: movabsq $6148820866244280320, %r10 # imm = 0x5555000000000000
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: andq %r10, %rbx
-; X64-NEXT: shrq %rdi
; X64-NEXT: andq %r10, %rdi
-; X64-NEXT: leaq (%rdi,%rbx,2), %r10
-; X64-NEXT: bswapq %rbp
-; X64-NEXT: movq %rbp, %rdi
-; X64-NEXT: shrq $4, %rdi
-; X64-NEXT: andq %r13, %rdi
-; X64-NEXT: andq %r13, %rbp
-; X64-NEXT: shlq $4, %rbp
-; X64-NEXT: orq %rdi, %rbp
-; X64-NEXT: movq %rbp, %rdi
-; X64-NEXT: andq %r11, %rdi
-; X64-NEXT: shrq $2, %rbp
-; X64-NEXT: andq %r11, %rbp
-; X64-NEXT: leaq (%rbp,%rdi,4), %rdi
-; X64-NEXT: movabsq $6148914691236517205, %rbp # imm = 0x5555555555555555
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: leaq (%rdi,%r14,4), %rdi
+; X64-NEXT: movabsq $6148820866244280320, %r14 # imm = 0x5555000000000000
+; X64-NEXT: movq %rdi, %r13
+; X64-NEXT: andq %r14, %r13
; X64-NEXT: shrq %rdi
-; X64-NEXT: andq %rbp, %rdi
-; X64-NEXT: leaq (%rdi,%rbx,2), %r14
-; X64-NEXT: shrdq $48, %r14, %r10
+; X64-NEXT: andq %r14, %rdi
+; X64-NEXT: leaq (%rdi,%r13,2), %rdi
+; X64-NEXT: bswapq %rbx
+; X64-NEXT: movq %rbx, %r14
+; X64-NEXT: shrq $4, %r14
+; X64-NEXT: andq %r11, %r14
+; X64-NEXT: andq %r11, %rbx
+; X64-NEXT: shlq $4, %rbx
+; X64-NEXT: orq %r14, %rbx
+; X64-NEXT: movq %rbx, %r14
+; X64-NEXT: andq %r10, %r14
+; X64-NEXT: shrq $2, %rbx
+; X64-NEXT: andq %r10, %rbx
+; X64-NEXT: leaq (%rbx,%r14,4), %rbx
+; X64-NEXT: movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: andq %r14, %r13
+; X64-NEXT: shrq %rbx
+; X64-NEXT: andq %r14, %rbx
+; X64-NEXT: leaq (%rbx,%r13,2), %rbx
+; X64-NEXT: shrdq $48, %rbx, %rdi
; X64-NEXT: bswapq %r15
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: shrq $4, %rdi
-; X64-NEXT: andq %r13, %rdi
-; X64-NEXT: andq %r13, %r15
+; X64-NEXT: movq %r15, %r13
+; X64-NEXT: shrq $4, %r13
+; X64-NEXT: andq %r11, %r13
+; X64-NEXT: andq %r11, %r15
; X64-NEXT: shlq $4, %r15
-; X64-NEXT: orq %rdi, %r15
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: andq %r11, %rdi
+; X64-NEXT: orq %r13, %r15
+; X64-NEXT: movq %r15, %r13
+; X64-NEXT: andq %r10, %r13
; X64-NEXT: shrq $2, %r15
-; X64-NEXT: andq %r11, %r15
-; X64-NEXT: leaq (%r15,%rdi,4), %rdi
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: andq %rbp, %rbx
-; X64-NEXT: shrq %rdi
-; X64-NEXT: andq %rbp, %rdi
-; X64-NEXT: leaq (%rdi,%rbx,2), %r15
-; X64-NEXT: shrdq $48, %r15, %r14
+; X64-NEXT: andq %r10, %r15
+; X64-NEXT: leaq (%r15,%r13,4), %r15
+; X64-NEXT: movq %r15, %r13
+; X64-NEXT: andq %r14, %r13
+; X64-NEXT: shrq %r15
+; X64-NEXT: andq %r14, %r15
+; X64-NEXT: leaq (%r15,%r13,2), %r15
+; X64-NEXT: shrdq $48, %r15, %rbx
; X64-NEXT: bswapq %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: shrq $4, %rdi
-; X64-NEXT: andq %r13, %rdi
-; X64-NEXT: andq %r13, %r12
+; X64-NEXT: movq %r12, %r13
+; X64-NEXT: shrq $4, %r13
+; X64-NEXT: andq %r11, %r13
+; X64-NEXT: andq %r11, %r12
; X64-NEXT: shlq $4, %r12
-; X64-NEXT: orq %rdi, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: andq %r11, %rdi
+; X64-NEXT: orq %r13, %r12
+; X64-NEXT: movq %r12, %r13
+; X64-NEXT: andq %r10, %r13
; X64-NEXT: shrq $2, %r12
-; X64-NEXT: andq %r11, %r12
-; X64-NEXT: leaq (%r12,%rdi,4), %rdi
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: andq %rbp, %rbx
-; X64-NEXT: shrq %rdi
-; X64-NEXT: andq %rbp, %rdi
-; X64-NEXT: leaq (%rdi,%rbx,2), %r12
+; X64-NEXT: andq %r10, %r12
+; X64-NEXT: leaq (%r12,%r13,4), %r12
+; X64-NEXT: movq %r12, %r13
+; X64-NEXT: andq %r14, %r13
+; X64-NEXT: shrq %r12
+; X64-NEXT: andq %r14, %r12
+; X64-NEXT: leaq (%r12,%r13,2), %r12
; X64-NEXT: shrdq $48, %r12, %r15
; X64-NEXT: bswapq %r9
-; X64-NEXT: movq %r9, %rdi
-; X64-NEXT: shrq $4, %rdi
-; X64-NEXT: andq %r13, %rdi
-; X64-NEXT: andq %r13, %r9
+; X64-NEXT: movq %r9, %r13
+; X64-NEXT: shrq $4, %r13
+; X64-NEXT: andq %r11, %r13
+; X64-NEXT: andq %r11, %r9
; X64-NEXT: shlq $4, %r9
-; X64-NEXT: orq %rdi, %r9
-; X64-NEXT: movq %r9, %rdi
-; X64-NEXT: andq %r11, %rdi
+; X64-NEXT: orq %r13, %r9
+; X64-NEXT: movq %r9, %r13
+; X64-NEXT: andq %r10, %r13
; X64-NEXT: shrq $2, %r9
-; X64-NEXT: andq %r11, %r9
-; X64-NEXT: leaq (%r9,%rdi,4), %rdi
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: andq %rbp, %rbx
-; X64-NEXT: shrq %rdi
-; X64-NEXT: andq %rbp, %rdi
-; X64-NEXT: leaq (%rdi,%rbx,2), %r9
+; X64-NEXT: andq %r10, %r9
+; X64-NEXT: leaq (%r9,%r13,4), %r9
+; X64-NEXT: movq %r9, %r13
+; X64-NEXT: andq %r14, %r13
+; X64-NEXT: shrq %r9
+; X64-NEXT: andq %r14, %r9
+; X64-NEXT: leaq (%r9,%r13,2), %r9
; X64-NEXT: shrdq $48, %r9, %r12
; X64-NEXT: bswapq %r8
-; X64-NEXT: movq %r8, %rdi
-; X64-NEXT: shrq $4, %rdi
-; X64-NEXT: andq %r13, %rdi
-; X64-NEXT: andq %r13, %r8
+; X64-NEXT: movq %r8, %r13
+; X64-NEXT: shrq $4, %r13
+; X64-NEXT: andq %r11, %r13
+; X64-NEXT: andq %r11, %r8
; X64-NEXT: shlq $4, %r8
-; X64-NEXT: orq %rdi, %r8
-; X64-NEXT: movq %r8, %rdi
-; X64-NEXT: andq %r11, %rdi
+; X64-NEXT: orq %r13, %r8
+; X64-NEXT: movq %r8, %r13
+; X64-NEXT: andq %r10, %r13
; X64-NEXT: shrq $2, %r8
-; X64-NEXT: andq %r11, %r8
-; X64-NEXT: leaq (%r8,%rdi,4), %rdi
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: andq %rbp, %rbx
-; X64-NEXT: shrq %rdi
-; X64-NEXT: andq %rbp, %rdi
-; X64-NEXT: leaq (%rdi,%rbx,2), %rdi
-; X64-NEXT: shrdq $48, %rdi, %r9
+; X64-NEXT: andq %r10, %r8
+; X64-NEXT: leaq (%r8,%r13,4), %r8
+; X64-NEXT: movq %r8, %r13
+; X64-NEXT: andq %r14, %r13
+; X64-NEXT: shrq %r8
+; X64-NEXT: andq %r14, %r8
+; X64-NEXT: leaq (%r8,%r13,2), %r8
+; X64-NEXT: shrdq $48, %r8, %r9
; X64-NEXT: bswapq %rcx
-; X64-NEXT: movq %rcx, %rbx
-; X64-NEXT: shrq $4, %rbx
-; X64-NEXT: andq %r13, %rbx
-; X64-NEXT: andq %r13, %rcx
+; X64-NEXT: movq %rcx, %r13
+; X64-NEXT: shrq $4, %r13
+; X64-NEXT: andq %r11, %r13
+; X64-NEXT: andq %r11, %rcx
; X64-NEXT: shlq $4, %rcx
-; X64-NEXT: orq %rbx, %rcx
-; X64-NEXT: movq %rcx, %rbx
-; X64-NEXT: andq %r11, %rbx
+; X64-NEXT: orq %r13, %rcx
+; X64-NEXT: movq %rcx, %r13
+; X64-NEXT: andq %r10, %r13
; X64-NEXT: shrq $2, %rcx
-; X64-NEXT: andq %r11, %rcx
-; X64-NEXT: leaq (%rcx,%rbx,4), %rcx
-; X64-NEXT: movq %rcx, %rbx
-; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: andq %r10, %rcx
+; X64-NEXT: leaq (%rcx,%r13,4), %rcx
+; X64-NEXT: movq %rcx, %r13
+; X64-NEXT: andq %r14, %r13
; X64-NEXT: shrq %rcx
-; X64-NEXT: andq %rbp, %rcx
-; X64-NEXT: leaq (%rcx,%rbx,2), %rcx
-; X64-NEXT: shrdq $48, %rcx, %rdi
+; X64-NEXT: andq %r14, %rcx
+; X64-NEXT: leaq (%rcx,%r13,2), %rcx
+; X64-NEXT: shrdq $48, %rcx, %r8
; X64-NEXT: bswapq %rdx
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: shrq $4, %rbx
-; X64-NEXT: andq %r13, %rbx
-; X64-NEXT: andq %r13, %rdx
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: shrq $4, %r13
+; X64-NEXT: andq %r11, %r13
+; X64-NEXT: andq %r11, %rdx
; X64-NEXT: shlq $4, %rdx
-; X64-NEXT: orq %rbx, %rdx
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: andq %r11, %rbx
+; X64-NEXT: orq %r13, %rdx
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: andq %r10, %r13
; X64-NEXT: shrq $2, %rdx
-; X64-NEXT: andq %r11, %rdx
-; X64-NEXT: leaq (%rdx,%rbx,4), %rdx
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: andq %r10, %rdx
+; X64-NEXT: leaq (%rdx,%r13,4), %rdx
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: andq %r14, %r13
; X64-NEXT: shrq %rdx
-; X64-NEXT: andq %rbp, %rdx
-; X64-NEXT: leaq (%rdx,%rbx,2), %rdx
+; X64-NEXT: andq %r14, %rdx
+; X64-NEXT: leaq (%rdx,%r13,2), %rdx
; X64-NEXT: shrdq $48, %rdx, %rcx
; X64-NEXT: bswapq %rsi
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: shrq $4, %rbx
-; X64-NEXT: andq %r13, %rbx
-; X64-NEXT: andq %r13, %rsi
+; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: shrq $4, %r13
+; X64-NEXT: andq %r11, %r13
+; X64-NEXT: andq %r11, %rsi
; X64-NEXT: shlq $4, %rsi
-; X64-NEXT: orq %rbx, %rsi
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: andq %r11, %rbx
+; X64-NEXT: orq %r13, %rsi
+; X64-NEXT: movq %rsi, %r11
+; X64-NEXT: andq %r10, %r11
; X64-NEXT: shrq $2, %rsi
-; X64-NEXT: andq %r11, %rsi
-; X64-NEXT: leaq (%rsi,%rbx,4), %rsi
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: andq %rbp, %rbx
+; X64-NEXT: andq %r10, %rsi
+; X64-NEXT: leaq (%rsi,%r11,4), %rsi
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: andq %r14, %r10
; X64-NEXT: shrq %rsi
-; X64-NEXT: andq %rbp, %rsi
-; X64-NEXT: leaq (%rsi,%rbx,2), %rsi
+; X64-NEXT: andq %r14, %rsi
+; X64-NEXT: leaq (%rsi,%r10,2), %rsi
; X64-NEXT: shrdq $48, %rsi, %rdx
; X64-NEXT: shrq $48, %rsi
; X64-NEXT: movq %rdx, 56(%rax)
; X64-NEXT: movq %rcx, 48(%rax)
-; X64-NEXT: movq %rdi, 40(%rax)
+; X64-NEXT: movq %r8, 40(%rax)
; X64-NEXT: movq %r9, 32(%rax)
; X64-NEXT: movq %r12, 24(%rax)
; X64-NEXT: movq %r15, 16(%rax)
-; X64-NEXT: movq %r14, 8(%rax)
-; X64-NEXT: movq %r10, (%rax)
+; X64-NEXT: movq %rbx, 8(%rax)
+; X64-NEXT: movq %rdi, (%rax)
; X64-NEXT: movw %si, 64(%rax)
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; X86XOP-LABEL: large_promotion:
@@ -1415,7 +1413,6 @@ define i528 @large_promotion(i528 %A) nounwind {
;
; GFNI-LABEL: large_promotion:
; GFNI: # %bb.0:
-; GFNI-NEXT: pushq %rbp
; GFNI-NEXT: pushq %r15
; GFNI-NEXT: pushq %r14
; GFNI-NEXT: pushq %r13
@@ -1424,189 +1421,188 @@ define i528 @large_promotion(i528 %A) nounwind {
; GFNI-NEXT: movq %rdi, %rax
; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %r12
; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; GFNI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; GFNI-NEXT: bswapq %rdi
-; GFNI-NEXT: movq %rdi, %rbx
-; GFNI-NEXT: shrq $4, %rbx
-; GFNI-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F
-; GFNI-NEXT: andq %r13, %rbx
-; GFNI-NEXT: andq %r13, %rdi
+; GFNI-NEXT: movq %rdi, %r10
+; GFNI-NEXT: shrq $4, %r10
+; GFNI-NEXT: movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
+; GFNI-NEXT: andq %r11, %r10
+; GFNI-NEXT: andq %r11, %rdi
; GFNI-NEXT: shlq $4, %rdi
-; GFNI-NEXT: orq %rbx, %rdi
-; GFNI-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
-; GFNI-NEXT: movq %rdi, %rbx
-; GFNI-NEXT: andq %r11, %rbx
+; GFNI-NEXT: orq %r10, %rdi
+; GFNI-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
+; GFNI-NEXT: movq %rdi, %r14
+; GFNI-NEXT: andq %r10, %r14
; GFNI-NEXT: shrq $2, %rdi
-; GFNI-NEXT: andq %r11, %rdi
-; GFNI-NEXT: leaq (%rdi,%rbx,4), %rdi
-; GFNI-NEXT: movabsq $6148820866244280320, %r10 # imm = 0x5555000000000000
-; GFNI-NEXT: movq %rdi, %rbx
-; GFNI-NEXT: andq %r10, %rbx
-; GFNI-NEXT: shrq %rdi
; GFNI-NEXT: andq %r10, %rdi
-; GFNI-NEXT: leaq (%rdi,%rbx,2), %r10
-; GFNI-NEXT: bswapq %rbp
-; GFNI-NEXT: movq %rbp, %rdi
-; GFNI-NEXT: shrq $4, %rdi
-; GFNI-NEXT: andq %r13, %rdi
-; GFNI-NEXT: andq %r13, %rbp
-; GFNI-NEXT: shlq $4, %rbp
-; GFNI-NEXT: orq %rdi, %rbp
-; GFNI-NEXT: movq %rbp, %rdi
-; GFNI-NEXT: andq %r11, %rdi
-; GFNI-NEXT: shrq $2, %rbp
-; GFNI-NEXT: andq %r11, %rbp
-; GFNI-NEXT: leaq (%rbp,%rdi,4), %rdi
-; GFNI-NEXT: movabsq $6148914691236517205, %rbp # imm = 0x5555555555555555
-; GFNI-NEXT: movq %rdi, %rbx
-; GFNI-NEXT: andq %rbp, %rbx
+; GFNI-NEXT: leaq (%rdi,%r14,4), %rdi
+; GFNI-NEXT: movabsq $6148820866244280320, %r14 # imm = 0x5555000000000000
+; GFNI-NEXT: movq %rdi, %r13
+; GFNI-NEXT: andq %r14, %r13
; GFNI-NEXT: shrq %rdi
-; GFNI-NEXT: andq %rbp, %rdi
-; GFNI-NEXT: leaq (%rdi,%rbx,2), %r14
-; GFNI-NEXT: shrdq $48, %r14, %r10
+; GFNI-NEXT: andq %r14, %rdi
+; GFNI-NEXT: leaq (%rdi,%r13,2), %rdi
+; GFNI-NEXT: bswapq %rbx
+; GFNI-NEXT: movq %rbx, %r14
+; GFNI-NEXT: shrq $4, %r14
+; GFNI-NEXT: andq %r11, %r14
+; GFNI-NEXT: andq %r11, %rbx
+; GFNI-NEXT: shlq $4, %rbx
+; GFNI-NEXT: orq %r14, %rbx
+; GFNI-NEXT: movq %rbx, %r14
+; GFNI-NEXT: andq %r10, %r14
+; GFNI-NEXT: shrq $2, %rbx
+; GFNI-NEXT: andq %r10, %rbx
+; GFNI-NEXT: leaq (%rbx,%r14,4), %rbx
+; GFNI-NEXT: movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555
+; GFNI-NEXT: movq %rbx, %r13
+; GFNI-NEXT: andq %r14, %r13
+; GFNI-NEXT: shrq %rbx
+; GFNI-NEXT: andq %r14, %rbx
+; GFNI-NEXT: leaq (%rbx,%r13,2), %rbx
+; GFNI-NEXT: shrdq $48, %rbx, %rdi
; GFNI-NEXT: bswapq %r15
-; GFNI-NEXT: movq %r15, %rdi
-; GFNI-NEXT: shrq $4, %rdi
-; GFNI-NEXT: andq %r13, %rdi
-; GFNI-NEXT: andq %r13, %r15
+; GFNI-NEXT: movq %r15, %r13
+; GFNI-NEXT: shrq $4, %r13
+; GFNI-NEXT: andq %r11, %r13
+; GFNI-NEXT: andq %r11, %r15
; GFNI-NEXT: shlq $4, %r15
-; GFNI-NEXT: orq %rdi, %r15
-; GFNI-NEXT: movq %r15, %rdi
-; GFNI-NEXT: andq %r11, %rdi
+; GFNI-NEXT: orq %r13, %r15
+; GFNI-NEXT: movq %r15, %r13
+; GFNI-NEXT: andq %r10, %r13
; GFNI-NEXT: shrq $2, %r15
-; GFNI-NEXT: andq %r11, %r15
-; GFNI-NEXT: leaq (%r15,%rdi,4), %rdi
-; GFNI-NEXT: movq %rdi, %rbx
-; GFNI-NEXT: andq %rbp, %rbx
-; GFNI-NEXT: shrq %rdi
-; GFNI-NEXT: andq %rbp, %rdi
-; GFNI-NEXT: leaq (%rdi,%rbx,2), %r15
-; GFNI-NEXT: shrdq $48, %r15, %r14
+; GFNI-NEXT: andq %r10, %r15
+; GFNI-NEXT: leaq (%r15,%r13,4), %r15
+; GFNI-NEXT: movq %r15, %r13
+; GFNI-NEXT: andq %r14, %r13
+; GFNI-NEXT: shrq %r15
+; GFNI-NEXT: andq %r14, %r15
+; GFNI-NEXT: leaq (%r15,%r13,2), %r15
+; GFNI-NEXT: shrdq $48, %r15, %rbx
; GFNI-NEXT: bswapq %r12
-; GFNI-NEXT: movq %r12, %rdi
-; GFNI-NEXT: shrq $4, %rdi
-; GFNI-NEXT: andq %r13, %rdi
-; GFNI-NEXT: andq %r13, %r12
+; GFNI-NEXT: movq %r12, %r13
+; GFNI-NEXT: shrq $4, %r13
+; GFNI-NEXT: andq %r11, %r13
+; GFNI-NEXT: andq %r11, %r12
; GFNI-NEXT: shlq $4, %r12
-; GFNI-NEXT: orq %rdi, %r12
-; GFNI-NEXT: movq %r12, %rdi
-; GFNI-NEXT: andq %r11, %rdi
+; GFNI-NEXT: orq %r13, %r12
+; GFNI-NEXT: movq %r12, %r13
+; GFNI-NEXT: andq %r10, %r13
; GFNI-NEXT: shrq $2, %r12
-; GFNI-NEXT: andq %r11, %r12
-; GFNI-NEXT: leaq (%r12,%rdi,4), %rdi
-; GFNI-NEXT: movq %rdi, %rbx
-; GFNI-NEXT: andq %rbp, %rbx
-; GFNI-NEXT: shrq %rdi
-; GFNI-NEXT: andq %rbp, %rdi
-; GFNI-NEXT: leaq (%rdi,%rbx,2), %r12
+; GFNI-NEXT: andq %r10, %r12
+; GFNI-NEXT: leaq (%r12,%r13,4), %r12
+; GFNI-NEXT: movq %r12, %r13
+; GFNI-NEXT: andq %r14, %r13
+; GFNI-NEXT: shrq %r12
+; GFNI-NEXT: andq %r14, %r12
+; GFNI-NEXT: leaq (%r12,%r13,2), %r12
; GFNI-NEXT: shrdq $48, %r12, %r15
; GFNI-NEXT: bswapq %r9
-; GFNI-NEXT: movq %r9, %rdi
-; GFNI-NEXT: shrq $4, %rdi
-; GFNI-NEXT: andq %r13, %rdi
-; GFNI-NEXT: andq %r13, %r9
+; GFNI-NEXT: movq %r9, %r13
+; GFNI-NEXT: shrq $4, %r13
+; GFNI-NEXT: andq %r11, %r13
+; GFNI-NEXT: andq %r11, %r9
; GFNI-NEXT: shlq $4, %r9
-; GFNI-NEXT: orq %rdi, %r9
-; GFNI-NEXT: movq %r9, %rdi
-; GFNI-NEXT: andq %r11, %rdi
+; GFNI-NEXT: orq %r13, %r9
+; GFNI-NEXT: movq %r9, %r13
+; GFNI-NEXT: andq %r10, %r13
; GFNI-NEXT: shrq $2, %r9
-; GFNI-NEXT: andq %r11, %r9
-; GFNI-NEXT: leaq (%r9,%rdi,4), %rdi
-; GFNI-NEXT: movq %rdi, %rbx
-; GFNI-NEXT: andq %rbp, %rbx
-; GFNI-NEXT: shrq %rdi
-; GFNI-NEXT: andq %rbp, %rdi
-; GFNI-NEXT: leaq (%rdi,%rbx,2), %r9
+; GFNI-NEXT: andq %r10, %r9
+; GFNI-NEXT: leaq (%r9,%r13,4), %r9
+; GFNI-NEXT: movq %r9, %r13
+; GFNI-NEXT: andq %r14, %r13
+; GFNI-NEXT: shrq %r9
+; GFNI-NEXT: andq %r14, %r9
+; GFNI-NEXT: leaq (%r9,%r13,2), %r9
; GFNI-NEXT: shrdq $48, %r9, %r12
; GFNI-NEXT: bswapq %r8
-; GFNI-NEXT: movq %r8, %rdi
-; GFNI-NEXT: shrq $4, %rdi
-; GFNI-NEXT: andq %r13, %rdi
-; GFNI-NEXT: andq %r13, %r8
+; GFNI-NEXT: movq %r8, %r13
+; GFNI-NEXT: shrq $4, %r13
+; GFNI-NEXT: andq %r11, %r13
+; GFNI-NEXT: andq %r11, %r8
; GFNI-NEXT: shlq $4, %r8
-; GFNI-NEXT: orq %rdi, %r8
-; GFNI-NEXT: movq %r8, %rdi
-; GFNI-NEXT: andq %r11, %rdi
+; GFNI-NEXT: orq %r13, %r8
+; GFNI-NEXT: movq %r8, %r13
+; GFNI-NEXT: andq %r10, %r13
; GFNI-NEXT: shrq $2, %r8
-; GFNI-NEXT: andq %r11, %r8
-; GFNI-NEXT: leaq (%r8,%rdi,4), %rdi
-; GFNI-NEXT: movq %rdi, %rbx
-; GFNI-NEXT: andq %rbp, %rbx
-; GFNI-NEXT: shrq %rdi
-; GFNI-NEXT: andq %rbp, %rdi
-; GFNI-NEXT: leaq (%rdi,%rbx,2), %rdi
-; GFNI-NEXT: shrdq $48, %rdi, %r9
+; GFNI-NEXT: andq %r10, %r8
+; GFNI-NEXT: leaq (%r8,%r13,4), %r8
+; GFNI-NEXT: movq %r8, %r13
+; GFNI-NEXT: andq %r14, %r13
+; GFNI-NEXT: shrq %r8
+; GFNI-NEXT: andq %r14, %r8
+; GFNI-NEXT: leaq (%r8,%r13,2), %r8
+; GFNI-NEXT: shrdq $48, %r8, %r9
; GFNI-NEXT: bswapq %rcx
-; GFNI-NEXT: movq %rcx, %rbx
-; GFNI-NEXT: shrq $4, %rbx
-; GFNI-NEXT: andq %r13, %rbx
-; GFNI-NEXT: andq %r13, %rcx
+; GFNI-NEXT: movq %rcx, %r13
+; GFNI-NEXT: shrq $4, %r13
+; GFNI-NEXT: andq %r11, %r13
+; GFNI-NEXT: andq %r11, %rcx
; GFNI-NEXT: shlq $4, %rcx
-; GFNI-NEXT: orq %rbx, %rcx
-; GFNI-NEXT: movq %rcx, %rbx
-; GFNI-NEXT: andq %r11, %rbx
+; GFNI-NEXT: orq %r13, %rcx
+; GFNI-NEXT: movq %rcx, %r13
+; GFNI-NEXT: andq %r10, %r13
; GFNI-NEXT: shrq $2, %rcx
-; GFNI-NEXT: andq %r11, %rcx
-; GFNI-NEXT: leaq (%rcx,%rbx,4), %rcx
-; GFNI-NEXT: movq %rcx, %rbx
-; GFNI-NEXT: andq %rbp, %rbx
+; GFNI-NEXT: andq %r10, %rcx
+; GFNI-NEXT: leaq (%rcx,%r13,4), %rcx
+; GFNI-NEXT: movq %rcx, %r13
+; GFNI-NEXT: andq %r14, %r13
; GFNI-NEXT: shrq %rcx
-; GFNI-NEXT: andq %rbp, %rcx
-; GFNI-NEXT: leaq (%rcx,%rbx,2), %rcx
-; GFNI-NEXT: shrdq $48, %rcx, %rdi
+; GFNI-NEXT: andq %r14, %rcx
+; GFNI-NEXT: leaq (%rcx,%r13,2), %rcx
+; GFNI-NEXT: shrdq $48, %rcx, %r8
; GFNI-NEXT: bswapq %rdx
-; GFNI-NEXT: movq %rdx, %rbx
-; GFNI-NEXT: shrq $4, %rbx
-; GFNI-NEXT: andq %r13, %rbx
-; GFNI-NEXT: andq %r13, %rdx
+; GFNI-NEXT: movq %rdx, %r13
+; GFNI-NEXT: shrq $4, %r13
+; GFNI-NEXT: andq %r11, %r13
+; GFNI-NEXT: andq %r11, %rdx
; GFNI-NEXT: shlq $4, %rdx
-; GFNI-NEXT: orq %rbx, %rdx
-; GFNI-NEXT: movq %rdx, %rbx
-; GFNI-NEXT: andq %r11, %rbx
+; GFNI-NEXT: orq %r13, %rdx
+; GFNI-NEXT: movq %rdx, %r13
+; GFNI-NEXT: andq %r10, %r13
; GFNI-NEXT: shrq $2, %rdx
-; GFNI-NEXT: andq %r11, %rdx
-; GFNI-NEXT: leaq (%rdx,%rbx,4), %rdx
-; GFNI-NEXT: movq %rdx, %rbx
-; GFNI-NEXT: andq %rbp, %rbx
+; GFNI-NEXT: andq %r10, %rdx
+; GFNI-NEXT: leaq (%rdx,%r13,4), %rdx
+; GFNI-NEXT: movq %rdx, %r13
+; GFNI-NEXT: andq %r14, %r13
; GFNI-NEXT: shrq %rdx
-; GFNI-NEXT: andq %rbp, %rdx
-; GFNI-NEXT: leaq (%rdx,%rbx,2), %rdx
+; GFNI-NEXT: andq %r14, %rdx
+; GFNI-NEXT: leaq (%rdx,%r13,2), %rdx
; GFNI-NEXT: shrdq $48, %rdx, %rcx
; GFNI-NEXT: bswapq %rsi
-; GFNI-NEXT: movq %rsi, %rbx
-; GFNI-NEXT: shrq $4, %rbx
-; GFNI-NEXT: andq %r13, %rbx
-; GFNI-NEXT: andq %r13, %rsi
+; GFNI-NEXT: movq %rsi, %r13
+; GFNI-NEXT: shrq $4, %r13
+; GFNI-NEXT: andq %r11, %r13
+; GFNI-NEXT: andq %r11, %rsi
; GFNI-NEXT: shlq $4, %rsi
-; GFNI-NEXT: orq %rbx, %rsi
-; GFNI-NEXT: movq %rsi, %rbx
-; GFNI-NEXT: andq %r11, %rbx
+; GFNI-NEXT: orq %r13, %rsi
+; GFNI-NEXT: movq %rsi, %r11
+; GFNI-NEXT: andq %r10, %r11
; GFNI-NEXT: shrq $2, %rsi
-; GFNI-NEXT: andq %r11, %rsi
-; GFNI-NEXT: leaq (%rsi,%rbx,4), %rsi
-; GFNI-NEXT: movq %rsi, %rbx
-; GFNI-NEXT: andq %rbp, %rbx
+; GFNI-NEXT: andq %r10, %rsi
+; GFNI-NEXT: leaq (%rsi,%r11,4), %rsi
+; GFNI-NEXT: movq %rsi, %r10
+; GFNI-NEXT: andq %r14, %r10
; GFNI-NEXT: shrq %rsi
-; GFNI-NEXT: andq %rbp, %rsi
-; GFNI-NEXT: leaq (%rsi,%rbx,2), %rsi
+; GFNI-NEXT: andq %r14, %rsi
+; GFNI-NEXT: leaq (%rsi,%r10,2), %rsi
; GFNI-NEXT: shrdq $48, %rsi, %rdx
; GFNI-NEXT: shrq $48, %rsi
; GFNI-NEXT: movq %rdx, 56(%rax)
; GFNI-NEXT: movq %rcx, 48(%rax)
-; GFNI-NEXT: movq %rdi, 40(%rax)
+; GFNI-NEXT: movq %r8, 40(%rax)
; GFNI-NEXT: movq %r9, 32(%rax)
; GFNI-NEXT: movq %r12, 24(%rax)
; GFNI-NEXT: movq %r15, 16(%rax)
-; GFNI-NEXT: movq %r14, 8(%rax)
-; GFNI-NEXT: movq %r10, (%rax)
+; GFNI-NEXT: movq %rbx, 8(%rax)
+; GFNI-NEXT: movq %rdi, (%rax)
; GFNI-NEXT: movw %si, 64(%rax)
; GFNI-NEXT: popq %rbx
; GFNI-NEXT: popq %r12
; GFNI-NEXT: popq %r13
; GFNI-NEXT: popq %r14
; GFNI-NEXT: popq %r15
-; GFNI-NEXT: popq %rbp
; GFNI-NEXT: retq
%Z = call i528 @llvm.bitreverse.i528(i528 %A)
ret i528 %Z
diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll
index 61143248a0055..8ff7fb2d351ad 100644
--- a/llvm/test/CodeGen/X86/break-false-dep.ll
+++ b/llvm/test/CodeGen/X86/break-false-dep.ll
@@ -309,22 +309,22 @@ define i64 @loopdep2(ptr nocapture %x, ptr nocapture %y) nounwind {
; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-WIN-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-WIN-NEXT: movq (%rcx), %rax
-; SSE-WIN-NEXT: movl $1, %r8d
+; SSE-WIN-NEXT: movl $1, %ecx
; SSE-WIN-NEXT: .p2align 4, 0x90
; SSE-WIN-NEXT: .LBB7_1: # %loop
; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-WIN-NEXT: xorps %xmm0, %xmm0
-; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm0
+; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm0
; SSE-WIN-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE-WIN-NEXT: #APP
; SSE-WIN-NEXT: #NO_APP
; SSE-WIN-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; SSE-WIN-NEXT: # xmm0 = mem[0],zero
; SSE-WIN-NEXT: addsd (%rdx), %xmm0
-; SSE-WIN-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-WIN-NEXT: addq %rcx, %rax
-; SSE-WIN-NEXT: incq %r8
-; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90
+; SSE-WIN-NEXT: cvttsd2si %xmm0, %r8
+; SSE-WIN-NEXT: addq %r8, %rax
+; SSE-WIN-NEXT: incq %rcx
+; SSE-WIN-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90
; SSE-WIN-NEXT: jne .LBB7_1
; SSE-WIN-NEXT: # %bb.2: # %ret
; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
@@ -354,22 +354,22 @@ define i64 @loopdep2(ptr nocapture %x, ptr nocapture %y) nounwind {
; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: movq (%rcx), %rax
-; AVX-NEXT: movl $1, %r8d
+; AVX-NEXT: movl $1, %ecx
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB7_1: # %loop
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vcvtsi2sd %r8, %xmm1, %xmm0
+; AVX-NEXT: vcvtsi2sd %rcx, %xmm1, %xmm0
; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX-NEXT: #APP
; AVX-NEXT: #NO_APP
; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; AVX-NEXT: # xmm0 = mem[0],zero
; AVX-NEXT: vaddsd (%rdx), %xmm0, %xmm0
-; AVX-NEXT: vcvttsd2si %xmm0, %rcx
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: incq %r8
-; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90
+; AVX-NEXT: vcvttsd2si %xmm0, %r8
+; AVX-NEXT: addq %r8, %rax
+; AVX-NEXT: incq %rcx
+; AVX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90
; AVX-NEXT: jne .LBB7_1
; AVX-NEXT: # %bb.2: # %ret
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
@@ -472,38 +472,38 @@ define dso_local void @loopdep3() {
; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill
; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0
; SSE-WIN-NEXT: .seh_endprologue
-; SSE-WIN-NEXT: xorl %r9d, %r9d
-; SSE-WIN-NEXT: leaq v(%rip), %r8
-; SSE-WIN-NEXT: leaq x(%rip), %r10
-; SSE-WIN-NEXT: leaq y(%rip), %r11
-; SSE-WIN-NEXT: leaq z(%rip), %rax
-; SSE-WIN-NEXT: leaq w(%rip), %rdx
+; SSE-WIN-NEXT: xorl %eax, %eax
+; SSE-WIN-NEXT: leaq v(%rip), %rcx
+; SSE-WIN-NEXT: leaq x(%rip), %rdx
+; SSE-WIN-NEXT: leaq y(%rip), %r8
+; SSE-WIN-NEXT: leaq z(%rip), %r9
+; SSE-WIN-NEXT: leaq w(%rip), %r10
; SSE-WIN-NEXT: .p2align 4, 0x90
; SSE-WIN-NEXT: .LBB8_1: # %for.cond1.preheader
; SSE-WIN-NEXT: # =>This Loop Header: Depth=1
; SSE-WIN-NEXT: # Child Loop BB8_2 Depth 2
-; SSE-WIN-NEXT: movq %r8, %rcx
+; SSE-WIN-NEXT: movq %rcx, %r11
; SSE-WIN-NEXT: xorl %esi, %esi
; SSE-WIN-NEXT: .p2align 4, 0x90
; SSE-WIN-NEXT: .LBB8_2: # %for.body3
; SSE-WIN-NEXT: # Parent Loop BB8_1 Depth=1
; SSE-WIN-NEXT: # => This Inner Loop Header: Depth=2
; SSE-WIN-NEXT: xorps %xmm0, %xmm0
-; SSE-WIN-NEXT: cvtsi2sdl (%rcx), %xmm0
-; SSE-WIN-NEXT: mulsd (%rsi,%r10), %xmm0
-; SSE-WIN-NEXT: mulsd (%rsi,%r11), %xmm0
-; SSE-WIN-NEXT: mulsd (%rsi,%rax), %xmm0
-; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%rdx)
+; SSE-WIN-NEXT: cvtsi2sdl (%r11), %xmm0
+; SSE-WIN-NEXT: mulsd (%rsi,%rdx), %xmm0
+; SSE-WIN-NEXT: mulsd (%rsi,%r8), %xmm0
+; SSE-WIN-NEXT: mulsd (%rsi,%r9), %xmm0
+; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%r10)
; SSE-WIN-NEXT: #APP
; SSE-WIN-NEXT: #NO_APP
; SSE-WIN-NEXT: addq $8, %rsi
-; SSE-WIN-NEXT: addq $4, %rcx
+; SSE-WIN-NEXT: addq $4, %r11
; SSE-WIN-NEXT: cmpq $8192, %rsi # imm = 0x2000
; SSE-WIN-NEXT: jne .LBB8_2
; SSE-WIN-NEXT: # %bb.3: # %for.inc14
; SSE-WIN-NEXT: # in Loop: Header=BB8_1 Depth=1
-; SSE-WIN-NEXT: incl %r9d
-; SSE-WIN-NEXT: cmpl $100000, %r9d # imm = 0x186A0
+; SSE-WIN-NEXT: incl %eax
+; SSE-WIN-NEXT: cmpl $100000, %eax # imm = 0x186A0
; SSE-WIN-NEXT: jne .LBB8_1
; SSE-WIN-NEXT: # %bb.4: # %for.end16
; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload
@@ -548,38 +548,38 @@ define dso_local void @loopdep3() {
; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
; AVX-NEXT: .seh_savexmm %xmm6, 0
; AVX-NEXT: .seh_endprologue
-; AVX-NEXT: xorl %r9d, %r9d
-; AVX-NEXT: leaq v(%rip), %r8
-; AVX-NEXT: leaq x(%rip), %r10
-; AVX-NEXT: leaq y(%rip), %r11
-; AVX-NEXT: leaq z(%rip), %rax
-; AVX-NEXT: leaq w(%rip), %rdx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: leaq v(%rip), %rcx
+; AVX-NEXT: leaq x(%rip), %rdx
+; AVX-NEXT: leaq y(%rip), %r8
+; AVX-NEXT: leaq z(%rip), %r9
+; AVX-NEXT: leaq w(%rip), %r10
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB8_1: # %for.cond1.preheader
; AVX-NEXT: # =>This Loop Header: Depth=1
; AVX-NEXT: # Child Loop BB8_2 Depth 2
-; AVX-NEXT: movq %r8, %rcx
+; AVX-NEXT: movq %rcx, %r11
; AVX-NEXT: xorl %esi, %esi
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB8_2: # %for.body3
; AVX-NEXT: # Parent Loop BB8_1 Depth=1
; AVX-NEXT: # => This Inner Loop Header: Depth=2
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0
-; AVX-NEXT: vmulsd (%rsi,%r10), %xmm0, %xmm0
-; AVX-NEXT: vmulsd (%rsi,%r11), %xmm0, %xmm0
-; AVX-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0
-; AVX-NEXT: vmovsd %xmm0, (%rsi,%rdx)
+; AVX-NEXT: vcvtsi2sdl (%r11), %xmm0, %xmm0
+; AVX-NEXT: vmulsd (%rsi,%rdx), %xmm0, %xmm0
+; AVX-NEXT: vmulsd (%rsi,%r8), %xmm0, %xmm0
+; AVX-NEXT: vmulsd (%rsi,%r9), %xmm0, %xmm0
+; AVX-NEXT: vmovsd %xmm0, (%rsi,%r10)
; AVX-NEXT: #APP
; AVX-NEXT: #NO_APP
; AVX-NEXT: addq $8, %rsi
-; AVX-NEXT: addq $4, %rcx
+; AVX-NEXT: addq $4, %r11
; AVX-NEXT: cmpq $8192, %rsi # imm = 0x2000
; AVX-NEXT: jne .LBB8_2
; AVX-NEXT: # %bb.3: # %for.inc14
; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1
-; AVX-NEXT: incl %r9d
-; AVX-NEXT: cmpl $100000, %r9d # imm = 0x186A0
+; AVX-NEXT: incl %eax
+; AVX-NEXT: cmpl $100000, %eax # imm = 0x186A0
; AVX-NEXT: jne .LBB8_1
; AVX-NEXT: # %bb.4: # %for.end16
; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
@@ -1154,12 +1154,12 @@ define i64 @loopclearence(ptr nocapture %x, ptr nocapture %y) nounwind {
; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-WIN-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill
; SSE-WIN-NEXT: movq (%rcx), %rax
-; SSE-WIN-NEXT: movl $1, %r8d
+; SSE-WIN-NEXT: movl $1, %ecx
; SSE-WIN-NEXT: .p2align 4, 0x90
; SSE-WIN-NEXT: .LBB12_1: # %loop
; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-WIN-NEXT: xorps %xmm4, %xmm4
-; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm4
+; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm4
; SSE-WIN-NEXT: #APP
; SSE-WIN-NEXT: #NO_APP
; SSE-WIN-NEXT: #APP
@@ -1175,10 +1175,10 @@ define i64 @loopclearence(ptr nocapture %x, ptr nocapture %y) nounwind {
; SSE-WIN-NEXT: #APP
; SSE-WIN-NEXT: #NO_APP
; SSE-WIN-NEXT: addsd (%rdx), %xmm4
-; SSE-WIN-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-WIN-NEXT: addq %rcx, %rax
-; SSE-WIN-NEXT: incq %r8
-; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90
+; SSE-WIN-NEXT: cvttsd2si %xmm4, %r8
+; SSE-WIN-NEXT: addq %r8, %rax
+; SSE-WIN-NEXT: incq %rcx
+; SSE-WIN-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90
; SSE-WIN-NEXT: jne .LBB12_1
; SSE-WIN-NEXT: # %bb.2: # %ret
; SSE-WIN-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload
@@ -1204,11 +1204,11 @@ define i64 @loopclearence(ptr nocapture %x, ptr nocapture %y) nounwind {
; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
; AVX-NEXT: movq (%rcx), %rax
-; AVX-NEXT: movl $1, %r8d
+; AVX-NEXT: movl $1, %ecx
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB12_1: # %loop
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: vcvtsi2sd %r8, %xmm5, %xmm4
+; AVX-NEXT: vcvtsi2sd %rcx, %xmm5, %xmm4
; AVX-NEXT: #APP
; AVX-NEXT: #NO_APP
; AVX-NEXT: #APP
@@ -1224,10 +1224,10 @@ define i64 @loopclearence(ptr nocapture %x, ptr nocapture %y) nounwind {
; AVX-NEXT: #APP
; AVX-NEXT: #NO_APP
; AVX-NEXT: vaddsd (%rdx), %xmm4, %xmm0
-; AVX-NEXT: vcvttsd2si %xmm0, %rcx
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: incq %r8
-; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90
+; AVX-NEXT: vcvttsd2si %xmm0, %r8
+; AVX-NEXT: addq %r8, %rax
+; AVX-NEXT: incq %rcx
+; AVX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90
; AVX-NEXT: jne .LBB12_1
; AVX-NEXT: # %bb.2: # %ret
; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
@@ -1286,34 +1286,34 @@ define dso_local void @loopclearance2(ptr nocapture %y, ptr %x, double %c1, doub
; SSE-LINUX-NEXT: #NO_APP
; SSE-LINUX-NEXT: #APP
; SSE-LINUX-NEXT: #NO_APP
-; SSE-LINUX-NEXT: movl $1, %r8d
+; SSE-LINUX-NEXT: movl $1, %eax
; SSE-LINUX-NEXT: xorl %ecx, %ecx
; SSE-LINUX-NEXT: .p2align 4, 0x90
; SSE-LINUX-NEXT: .LBB13_1: # %inner_loop
; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE-LINUX-NEXT: movq %rcx, %rax
-; SSE-LINUX-NEXT: shrq $6, %rcx
-; SSE-LINUX-NEXT: movq (%rsi,%rcx,8), %rcx
-; SSE-LINUX-NEXT: btq %rax, %rcx
-; SSE-LINUX-NEXT: leaq 1(%rax), %rcx
+; SSE-LINUX-NEXT: movq %rcx, %r8
+; SSE-LINUX-NEXT: shrq $6, %r8
+; SSE-LINUX-NEXT: movq (%rsi,%r8,8), %r8
+; SSE-LINUX-NEXT: btq %rcx, %r8
+; SSE-LINUX-NEXT: leaq 1(%rcx), %rcx
; SSE-LINUX-NEXT: jae .LBB13_1
; SSE-LINUX-NEXT: # %bb.2: # %loop_end
; SSE-LINUX-NEXT: # in Loop: Header=BB13_1 Depth=1
-; SSE-LINUX-NEXT: leaq 1(%r8), %r9
+; SSE-LINUX-NEXT: leaq 1(%rax), %r8
; SSE-LINUX-NEXT: xorps %xmm4, %xmm4
-; SSE-LINUX-NEXT: cvtsi2sd %r9, %xmm4
+; SSE-LINUX-NEXT: cvtsi2sd %r8, %xmm4
; SSE-LINUX-NEXT: movapd %xmm0, %xmm5
; SSE-LINUX-NEXT: subsd %xmm4, %xmm5
; SSE-LINUX-NEXT: mulsd %xmm1, %xmm5
-; SSE-LINUX-NEXT: leaq -1(%rcx), %rax
+; SSE-LINUX-NEXT: leaq -1(%rcx), %r9
; SSE-LINUX-NEXT: xorps %xmm4, %xmm4
-; SSE-LINUX-NEXT: cvtsi2sd %rax, %xmm4
+; SSE-LINUX-NEXT: cvtsi2sd %r9, %xmm4
; SSE-LINUX-NEXT: mulsd %xmm2, %xmm4
; SSE-LINUX-NEXT: addsd %xmm5, %xmm4
; SSE-LINUX-NEXT: divsd %xmm3, %xmm4
-; SSE-LINUX-NEXT: movsd %xmm4, -8(%rdi,%r8,8)
-; SSE-LINUX-NEXT: movq %r9, %r8
-; SSE-LINUX-NEXT: cmpq %r9, %rdx
+; SSE-LINUX-NEXT: movsd %xmm4, -8(%rdi,%rax,8)
+; SSE-LINUX-NEXT: movq %r8, %rax
+; SSE-LINUX-NEXT: cmpq %r8, %rdx
; SSE-LINUX-NEXT: jge .LBB13_1
; SSE-LINUX-NEXT: # %bb.3: # %loopdone
; SSE-LINUX-NEXT: retq
@@ -1341,7 +1341,7 @@ define dso_local void @loopclearance2(ptr nocapture %y, ptr %x, double %c1, doub
; SSE-WIN-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill
; SSE-WIN-NEXT: .seh_savexmm %xmm7, 0
; SSE-WIN-NEXT: .seh_endprologue
-; SSE-WIN-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; SSE-WIN-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-WIN-NEXT: #APP
@@ -1358,35 +1358,34 @@ define dso_local void @loopclearance2(ptr nocapture %y, ptr %x, double %c1, doub
; SSE-WIN-NEXT: #NO_APP
; SSE-WIN-NEXT: #APP
; SSE-WIN-NEXT: #NO_APP
-; SSE-WIN-NEXT: movl $1, %r9d
-; SSE-WIN-NEXT: xorl %r11d, %r11d
+; SSE-WIN-NEXT: movl $1, %r8d
+; SSE-WIN-NEXT: xorl %r9d, %r9d
; SSE-WIN-NEXT: .p2align 4, 0x90
; SSE-WIN-NEXT: .LBB13_1: # %inner_loop
; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE-WIN-NEXT: movq %r11, %r10
-; SSE-WIN-NEXT: movq %r11, %rax
-; SSE-WIN-NEXT: shrq $6, %rax
-; SSE-WIN-NEXT: movq (%rdx,%rax,8), %rax
-; SSE-WIN-NEXT: btq %r11, %rax
-; SSE-WIN-NEXT: leaq 1(%r11), %r11
+; SSE-WIN-NEXT: movq %r9, %r10
+; SSE-WIN-NEXT: shrq $6, %r10
+; SSE-WIN-NEXT: movq (%rdx,%r10,8), %r10
+; SSE-WIN-NEXT: btq %r9, %r10
+; SSE-WIN-NEXT: leaq 1(%r9), %r9
; SSE-WIN-NEXT: jae .LBB13_1
; SSE-WIN-NEXT: # %bb.2: # %loop_end
; SSE-WIN-NEXT: # in Loop: Header=BB13_1 Depth=1
-; SSE-WIN-NEXT: leaq 1(%r9), %r10
+; SSE-WIN-NEXT: leaq 1(%r8), %r10
; SSE-WIN-NEXT: xorps %xmm4, %xmm4
; SSE-WIN-NEXT: cvtsi2sd %r10, %xmm4
; SSE-WIN-NEXT: movapd %xmm2, %xmm5
; SSE-WIN-NEXT: subsd %xmm4, %xmm5
; SSE-WIN-NEXT: mulsd %xmm3, %xmm5
-; SSE-WIN-NEXT: leaq -1(%r11), %rax
+; SSE-WIN-NEXT: leaq -1(%r9), %r11
; SSE-WIN-NEXT: xorps %xmm4, %xmm4
-; SSE-WIN-NEXT: cvtsi2sd %rax, %xmm4
+; SSE-WIN-NEXT: cvtsi2sd %r11, %xmm4
; SSE-WIN-NEXT: mulsd %xmm1, %xmm4
; SSE-WIN-NEXT: addsd %xmm5, %xmm4
; SSE-WIN-NEXT: divsd %xmm0, %xmm4
-; SSE-WIN-NEXT: movsd %xmm4, -8(%rcx,%r9,8)
-; SSE-WIN-NEXT: movq %r10, %r9
-; SSE-WIN-NEXT: cmpq %r10, %r8
+; SSE-WIN-NEXT: movsd %xmm4, -8(%rcx,%r8,8)
+; SSE-WIN-NEXT: movq %r10, %r8
+; SSE-WIN-NEXT: cmpq %r10, %rax
; SSE-WIN-NEXT: jge .LBB13_1
; SSE-WIN-NEXT: # %bb.3: # %loopdone
; SSE-WIN-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload
@@ -1425,7 +1424,7 @@ define dso_local void @loopclearance2(ptr nocapture %y, ptr %x, double %c1, doub
; AVX1-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill
; AVX1-NEXT: .seh_savexmm %xmm7, 0
; AVX1-NEXT: .seh_endprologue
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: #APP
@@ -1442,32 +1441,31 @@ define dso_local void @loopclearance2(ptr nocapture %y, ptr %x, double %c1, doub
; AVX1-NEXT: #NO_APP
; AVX1-NEXT: #APP
; AVX1-NEXT: #NO_APP
-; AVX1-NEXT: movl $1, %r9d
-; AVX1-NEXT: xorl %r11d, %r11d
+; AVX1-NEXT: movl $1, %r8d
+; AVX1-NEXT: xorl %r9d, %r9d
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB13_1: # %inner_loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: movq %r11, %r10
-; AVX1-NEXT: movq %r11, %rax
-; AVX1-NEXT: shrq $6, %rax
-; AVX1-NEXT: movq (%rdx,%rax,8), %rax
-; AVX1-NEXT: btq %r11, %rax
-; AVX1-NEXT: leaq 1(%r11), %r11
+; AVX1-NEXT: movq %r9, %r10
+; AVX1-NEXT: shrq $6, %r10
+; AVX1-NEXT: movq (%rdx,%r10,8), %r10
+; AVX1-NEXT: btq %r9, %r10
+; AVX1-NEXT: leaq 1(%r9), %r9
; AVX1-NEXT: jae .LBB13_1
; AVX1-NEXT: # %bb.2: # %loop_end
; AVX1-NEXT: # in Loop: Header=BB13_1 Depth=1
-; AVX1-NEXT: leaq 1(%r9), %r10
+; AVX1-NEXT: leaq 1(%r8), %r10
; AVX1-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4
; AVX1-NEXT: vsubsd %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vmulsd %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: leaq -1(%r11), %rax
-; AVX1-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5
+; AVX1-NEXT: leaq -1(%r9), %r11
+; AVX1-NEXT: vcvtsi2sd %r11, %xmm6, %xmm5
; AVX1-NEXT: vmulsd %xmm1, %xmm5, %xmm5
; AVX1-NEXT: vaddsd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vdivsd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8)
-; AVX1-NEXT: movq %r10, %r9
-; AVX1-NEXT: cmpq %r10, %r8
+; AVX1-NEXT: vmovsd %xmm4, -8(%rcx,%r8,8)
+; AVX1-NEXT: movq %r10, %r8
+; AVX1-NEXT: cmpq %r10, %rax
; AVX1-NEXT: jge .LBB13_1
; AVX1-NEXT: # %bb.3: # %loopdone
; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload
@@ -1507,7 +1505,7 @@ define dso_local void @loopclearance2(ptr nocapture %y, ptr %x, double %c1, doub
; AVX512VL-NEXT: .seh_savexmm %xmm7, 0
; AVX512VL-NEXT: .seh_endprologue
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT: #APP
; AVX512VL-NEXT: #NO_APP
@@ -1523,32 +1521,31 @@ define dso_local void @loopclearance2(ptr nocapture %y, ptr %x, double %c1, doub
; AVX512VL-NEXT: #NO_APP
; AVX512VL-NEXT: #APP
; AVX512VL-NEXT: #NO_APP
-; AVX512VL-NEXT: movl $1, %r9d
-; AVX512VL-NEXT: xorl %r11d, %r11d
+; AVX512VL-NEXT: movl $1, %r8d
+; AVX512VL-NEXT: xorl %r9d, %r9d
; AVX512VL-NEXT: .p2align 4, 0x90
; AVX512VL-NEXT: .LBB13_1: # %inner_loop
; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512VL-NEXT: movq %r11, %r10
-; AVX512VL-NEXT: movq %r11, %rax
-; AVX512VL-NEXT: shrq $6, %rax
-; AVX512VL-NEXT: movq (%rdx,%rax,8), %rax
-; AVX512VL-NEXT: btq %r11, %rax
-; AVX512VL-NEXT: leaq 1(%r11), %r11
+; AVX512VL-NEXT: movq %r9, %r10
+; AVX512VL-NEXT: shrq $6, %r10
+; AVX512VL-NEXT: movq (%rdx,%r10,8), %r10
+; AVX512VL-NEXT: btq %r9, %r10
+; AVX512VL-NEXT: leaq 1(%r9), %r9
; AVX512VL-NEXT: jae .LBB13_1
; AVX512VL-NEXT: # %bb.2: # %loop_end
; AVX512VL-NEXT: # in Loop: Header=BB13_1 Depth=1
-; AVX512VL-NEXT: leaq 1(%r9), %r10
+; AVX512VL-NEXT: leaq 1(%r8), %r10
; AVX512VL-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4
; AVX512VL-NEXT: vsubsd %xmm4, %xmm2, %xmm4
; AVX512VL-NEXT: vmulsd %xmm3, %xmm4, %xmm4
-; AVX512VL-NEXT: leaq -1(%r11), %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5
+; AVX512VL-NEXT: leaq -1(%r9), %r11
+; AVX512VL-NEXT: vcvtsi2sd %r11, %xmm6, %xmm5
; AVX512VL-NEXT: vmulsd %xmm1, %xmm5, %xmm5
; AVX512VL-NEXT: vaddsd %xmm5, %xmm4, %xmm4
; AVX512VL-NEXT: vdivsd %xmm0, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8)
-; AVX512VL-NEXT: movq %r10, %r9
-; AVX512VL-NEXT: cmpq %r10, %r8
+; AVX512VL-NEXT: vmovsd %xmm4, -8(%rcx,%r8,8)
+; AVX512VL-NEXT: movq %r10, %r8
+; AVX512VL-NEXT: cmpq %r10, %rax
; AVX512VL-NEXT: jge .LBB13_1
; AVX512VL-NEXT: # %bb.3: # %loopdone
; AVX512VL-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll
index 80b04d479e3c8..e965c621337c5 100644
--- a/llvm/test/CodeGen/X86/bswap.ll
+++ b/llvm/test/CodeGen/X86/bswap.ll
@@ -355,13 +355,13 @@ define i528 @large_promotion(i528 %A) nounwind {
; CHECK64-NEXT: movq %rdi, %rax
; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK64-NEXT: bswapq %r10
+; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK64-NEXT: bswapq %rdi
-; CHECK64-NEXT: shrdq $48, %rdi, %r10
+; CHECK64-NEXT: bswapq %r10
+; CHECK64-NEXT: shrdq $48, %r10, %rdi
; CHECK64-NEXT: bswapq %r11
-; CHECK64-NEXT: shrdq $48, %r11, %rdi
+; CHECK64-NEXT: shrdq $48, %r11, %r10
; CHECK64-NEXT: bswapq %rbx
; CHECK64-NEXT: shrdq $48, %rbx, %r11
; CHECK64-NEXT: bswapq %r9
@@ -381,8 +381,8 @@ define i528 @large_promotion(i528 %A) nounwind {
; CHECK64-NEXT: movq %r9, 32(%rax)
; CHECK64-NEXT: movq %rbx, 24(%rax)
; CHECK64-NEXT: movq %r11, 16(%rax)
-; CHECK64-NEXT: movq %rdi, 8(%rax)
-; CHECK64-NEXT: movq %r10, (%rax)
+; CHECK64-NEXT: movq %r10, 8(%rax)
+; CHECK64-NEXT: movq %rdi, (%rax)
; CHECK64-NEXT: movw %si, 64(%rax)
; CHECK64-NEXT: popq %rbx
; CHECK64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll
index 04604316160df..941f331833c19 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll
@@ -18,34 +18,34 @@ define i32 @foo(i32 %arg, ptr %arg3) nounwind {
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movabsq $-2305847407260205056, %rbx # imm = 0xDFFFFC0000000000
+; CHECK-NEXT: movabsq $-2305847407260205056, %r14 # imm = 0xDFFFFC0000000000
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_5
; CHECK-NEXT: # %bb.1: # %bb5
-; CHECK-NEXT: movq %rsi, %r14
+; CHECK-NEXT: movq %rsi, %rbx
; CHECK-NEXT: movslq %edi, %rbp
; CHECK-NEXT: leaq (,%rbp,8), %rax
-; CHECK-NEXT: leaq global(%rax,%rax,2), %r15
-; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r12
+; CHECK-NEXT: leaq global(%rax,%rax,2), %r14
+; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r15
; CHECK-NEXT: xorl %r13d, %r13d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %bb8
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: callq bar@PLT
-; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: movq %rax, %r12
; CHECK-NEXT: movq %rax, %rdi
-; CHECK-NEXT: callq *%r14
-; CHECK-NEXT: movq %r15, %rdi
+; CHECK-NEXT: callq *%rbx
+; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: callq hoge@PLT
-; CHECK-NEXT: movq %r12, %rdi
+; CHECK-NEXT: movq %r15, %rdi
; CHECK-NEXT: callq hoge@PLT
; CHECK-NEXT: testb %r13b, %r13b
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.3: # %bb15
; CHECK-NEXT: leaq (%rbp,%rbp,2), %rax
-; CHECK-NEXT: movq %rbx, global+16(,%rax,8)
-; CHECK-NEXT: movabsq $-2305847407260205056, %rbx # imm = 0xDFFFFC0000000000
+; CHECK-NEXT: movq %r12, global+16(,%rax,8)
+; CHECK-NEXT: movabsq $-2305847407260205056, %r14 # imm = 0xDFFFFC0000000000
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: # %bb.4: # %bb17
@@ -53,7 +53,7 @@ define i32 @foo(i32 %arg, ptr %arg3) nounwind {
; CHECK-NEXT: .LBB0_5: # Block address taken
; CHECK-NEXT: # %bb18
; CHECK-NEXT: # Label of block must be emitted
-; CHECK-NEXT: movw $0, 14(%rbx)
+; CHECK-NEXT: movw $0, 14(%r14)
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
index 6afde98d1508d..7ea3e2ce0a7ba 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
@@ -16,42 +16,40 @@ define dso_local void @n(ptr %o, i32 %p, i32 %u) nounwind {
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movl %edx, %ebx
+; CHECK-NEXT: movl %edx, %ebp
; CHECK-NEXT: movl %esi, %r12d
-; CHECK-NEXT: movq %rdi, %r15
+; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: callq c
-; CHECK-NEXT: movl %eax, %r13d
-; CHECK-NEXT: movq %r15, %rdi
+; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: movq %rbx, %rdi
; CHECK-NEXT: callq l
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: jne .LBB0_9
; CHECK-NEXT: # %bb.1: # %if.end
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: cmpl $0, e(%rip)
-; CHECK-NEXT: # implicit-def: $ebx
-; CHECK-NEXT: # implicit-def: $r14d
+; CHECK-NEXT: # implicit-def: $r15d
+; CHECK-NEXT: # implicit-def: $r13d
; CHECK-NEXT: je .LBB0_4
; CHECK-NEXT: # %bb.2: # %if.then4
; CHECK-NEXT: movslq %r12d, %rdi
; CHECK-NEXT: callq m
-; CHECK-NEXT: # implicit-def: $ebx
-; CHECK-NEXT: # implicit-def: $ebp
+; CHECK-NEXT: # implicit-def: $r15d
+; CHECK-NEXT: # implicit-def: $r12d
; CHECK-NEXT: .LBB0_3: # %r
; CHECK-NEXT: callq c
-; CHECK-NEXT: movl %ebp, %r14d
+; CHECK-NEXT: movl %r12d, %r13d
; CHECK-NEXT: .LBB0_4: # %if.end8
-; CHECK-NEXT: movl %ebx, %edi
+; CHECK-NEXT: movl %r15d, %edi
; CHECK-NEXT: callq i
-; CHECK-NEXT: movl %eax, %ebp
-; CHECK-NEXT: orl %r14d, %ebp
-; CHECK-NEXT: andl $4, %ebx
-; CHECK-NEXT: testl %r13d, %r13d
+; CHECK-NEXT: movl %eax, %r12d
+; CHECK-NEXT: orl %r13d, %r12d
+; CHECK-NEXT: andl $4, %r15d
+; CHECK-NEXT: testl %r14d, %r14d
; CHECK-NEXT: jne .LBB0_3
; CHECK-NEXT: # %bb.5: # %if.end12
-; CHECK-NEXT: testl %ebp, %ebp
+; CHECK-NEXT: testl %r12d, %r12d
; CHECK-NEXT: je .LBB0_8
; CHECK-NEXT: # %bb.6: # %if.then14
-; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: jmp .LBB0_9
@@ -59,9 +57,9 @@ define dso_local void @n(ptr %o, i32 %p, i32 %u) nounwind {
; CHECK-NEXT: # %if.then20.critedge
; CHECK-NEXT: # Label of block must be emitted
; CHECK-NEXT: movl j(%rip), %edi
-; CHECK-NEXT: movslq %eax, %rcx
+; CHECK-NEXT: movslq %ebp, %rcx
; CHECK-NEXT: movl $1, %esi
-; CHECK-NEXT: movq %r15, %rdx
+; CHECK-NEXT: movq %rbx, %rdx
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll b/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll
index a8c2db5f1c4c4..43776bfac4628 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll
@@ -15,14 +15,14 @@ define void @test1(ptr %arg, ptr %mem) nounwind {
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movq %rsi, %r14
+; CHECK-NEXT: movq %rsi, %rbx
; CHECK-NEXT: .LBB0_1: # Block address taken
; CHECK-NEXT: # %loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: # Label of block must be emitted
-; CHECK-NEXT: movq (%r14), %rbx
+; CHECK-NEXT: movq (%rbx), %r14
; CHECK-NEXT: callq foo@PLT
-; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: # %bb.2: # %end
diff --git a/llvm/test/CodeGen/X86/cgp-usubo.ll b/llvm/test/CodeGen/X86/cgp-usubo.ll
index 7979689211c3b..697df09ea28f3 100644
--- a/llvm/test/CodeGen/X86/cgp-usubo.ll
+++ b/llvm/test/CodeGen/X86/cgp-usubo.ll
@@ -171,18 +171,18 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
; CHECK-NEXT: testb $1, %bpl
; CHECK-NEXT: je .LBB9_2
; CHECK-NEXT: # %bb.1: # %t
-; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movq %rdx, %rbx
; CHECK-NEXT: movq %rsi, %r15
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: xorl %edi, %edi
-; CHECK-NEXT: cmpq %rsi, %rbx
+; CHECK-NEXT: cmpq %rsi, %r14
; CHECK-NEXT: setb %dil
; CHECK-NEXT: callq call@PLT
-; CHECK-NEXT: subq %r15, %rbx
+; CHECK-NEXT: subq %r15, %r14
; CHECK-NEXT: jae .LBB9_2
; CHECK-NEXT: # %bb.4: # %end
; CHECK-NEXT: setb %al
-; CHECK-NEXT: movq %rbx, (%r14)
+; CHECK-NEXT: movq %r14, (%rbx)
; CHECK-NEXT: jmp .LBB9_3
; CHECK-NEXT: .LBB9_2: # %f
; CHECK-NEXT: movl %ebp, %eax
diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 229a94bb399f1..2875b03fe1c29 100644
--- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -693,52 +693,52 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm1, %r10
-; SSE2-NEXT: movq %r10, %r8
-; SSE2-NEXT: shrq $56, %r8
+; SSE2-NEXT: movq %xmm1, %rdx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $56, %rax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movq %rdx, %rcx
+; SSE2-NEXT: shrq $48, %rcx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movq %rdx, %rsi
+; SSE2-NEXT: shrq $40, %rsi
+; SSE2-NEXT: andl $15, %esi
+; SSE2-NEXT: movq %rdx, %r8
+; SSE2-NEXT: shrq $32, %r8
; SSE2-NEXT: andl $15, %r8d
+; SSE2-NEXT: movq %xmm0, %r10
+; SSE2-NEXT: movq %r10, %rdi
+; SSE2-NEXT: shrq $56, %rdi
+; SSE2-NEXT: andl $15, %edi
; SSE2-NEXT: movq %r10, %r9
; SSE2-NEXT: shrq $48, %r9
; SSE2-NEXT: andl $15, %r9d
-; SSE2-NEXT: movq %r10, %rsi
-; SSE2-NEXT: shrq $40, %rsi
-; SSE2-NEXT: andl $15, %esi
; SSE2-NEXT: movq %r10, %r11
-; SSE2-NEXT: shrq $32, %r11
+; SSE2-NEXT: shrq $40, %r11
; SSE2-NEXT: andl $15, %r11d
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %rax, %rdx
-; SSE2-NEXT: shrq $56, %rdx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq $48, %rcx
-; SSE2-NEXT: andl $15, %ecx
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: shrq $40, %rdi
-; SSE2-NEXT: andl $15, %edi
-; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: movq %r10, %rbx
; SSE2-NEXT: shrq $32, %rbx
; SSE2-NEXT: andl $15, %ebx
; SSE2-NEXT: shlq $32, %rbx
-; SSE2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; SSE2-NEXT: orq %rbx, %rax
-; SSE2-NEXT: shlq $40, %rdi
-; SSE2-NEXT: orq %rax, %rdi
-; SSE2-NEXT: shlq $48, %rcx
-; SSE2-NEXT: orq %rdi, %rcx
-; SSE2-NEXT: shlq $56, %rdx
-; SSE2-NEXT: orq %rcx, %rdx
-; SSE2-NEXT: shlq $32, %r11
; SSE2-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F
-; SSE2-NEXT: orq %r11, %r10
-; SSE2-NEXT: shlq $40, %rsi
-; SSE2-NEXT: orq %r10, %rsi
+; SSE2-NEXT: orq %rbx, %r10
+; SSE2-NEXT: shlq $40, %r11
+; SSE2-NEXT: orq %r10, %r11
; SSE2-NEXT: shlq $48, %r9
-; SSE2-NEXT: orq %rsi, %r9
-; SSE2-NEXT: shlq $56, %r8
-; SSE2-NEXT: orq %r9, %r8
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movq %r8, %xmm1
+; SSE2-NEXT: orq %r11, %r9
+; SSE2-NEXT: shlq $56, %rdi
+; SSE2-NEXT: orq %r9, %rdi
+; SSE2-NEXT: shlq $32, %r8
+; SSE2-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; SSE2-NEXT: orq %r8, %rdx
+; SSE2-NEXT: shlq $40, %rsi
+; SSE2-NEXT: orq %rdx, %rsi
+; SSE2-NEXT: shlq $48, %rcx
+; SSE2-NEXT: orq %rsi, %rcx
+; SSE2-NEXT: shlq $56, %rax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: movq %rdi, %xmm0
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
@@ -746,52 +746,52 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
; SSE42-LABEL: _clearupper16xi8b:
; SSE42: # %bb.0:
; SSE42-NEXT: pushq %rbx
-; SSE42-NEXT: pextrq $1, %xmm0, %r10
-; SSE42-NEXT: movq %r10, %r8
-; SSE42-NEXT: shrq $56, %r8
+; SSE42-NEXT: pextrq $1, %xmm0, %rdx
+; SSE42-NEXT: movq %rdx, %rax
+; SSE42-NEXT: shrq $56, %rax
+; SSE42-NEXT: andl $15, %eax
+; SSE42-NEXT: movq %rdx, %rcx
+; SSE42-NEXT: shrq $48, %rcx
+; SSE42-NEXT: andl $15, %ecx
+; SSE42-NEXT: movq %rdx, %rsi
+; SSE42-NEXT: shrq $40, %rsi
+; SSE42-NEXT: andl $15, %esi
+; SSE42-NEXT: movq %rdx, %r8
+; SSE42-NEXT: shrq $32, %r8
; SSE42-NEXT: andl $15, %r8d
+; SSE42-NEXT: movq %xmm0, %r10
+; SSE42-NEXT: movq %r10, %rdi
+; SSE42-NEXT: shrq $56, %rdi
+; SSE42-NEXT: andl $15, %edi
; SSE42-NEXT: movq %r10, %r9
; SSE42-NEXT: shrq $48, %r9
; SSE42-NEXT: andl $15, %r9d
-; SSE42-NEXT: movq %r10, %rsi
-; SSE42-NEXT: shrq $40, %rsi
-; SSE42-NEXT: andl $15, %esi
; SSE42-NEXT: movq %r10, %r11
-; SSE42-NEXT: shrq $32, %r11
+; SSE42-NEXT: shrq $40, %r11
; SSE42-NEXT: andl $15, %r11d
-; SSE42-NEXT: movq %xmm0, %rax
-; SSE42-NEXT: movq %rax, %rdx
-; SSE42-NEXT: shrq $56, %rdx
-; SSE42-NEXT: andl $15, %edx
-; SSE42-NEXT: movq %rax, %rcx
-; SSE42-NEXT: shrq $48, %rcx
-; SSE42-NEXT: andl $15, %ecx
-; SSE42-NEXT: movq %rax, %rdi
-; SSE42-NEXT: shrq $40, %rdi
-; SSE42-NEXT: andl $15, %edi
-; SSE42-NEXT: movq %rax, %rbx
+; SSE42-NEXT: movq %r10, %rbx
; SSE42-NEXT: shrq $32, %rbx
; SSE42-NEXT: andl $15, %ebx
; SSE42-NEXT: shlq $32, %rbx
-; SSE42-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; SSE42-NEXT: orq %rbx, %rax
-; SSE42-NEXT: shlq $40, %rdi
-; SSE42-NEXT: orq %rax, %rdi
-; SSE42-NEXT: shlq $48, %rcx
-; SSE42-NEXT: orq %rdi, %rcx
-; SSE42-NEXT: shlq $56, %rdx
-; SSE42-NEXT: orq %rcx, %rdx
-; SSE42-NEXT: shlq $32, %r11
; SSE42-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F
-; SSE42-NEXT: orq %r11, %r10
-; SSE42-NEXT: shlq $40, %rsi
-; SSE42-NEXT: orq %r10, %rsi
+; SSE42-NEXT: orq %rbx, %r10
+; SSE42-NEXT: shlq $40, %r11
+; SSE42-NEXT: orq %r10, %r11
; SSE42-NEXT: shlq $48, %r9
-; SSE42-NEXT: orq %rsi, %r9
-; SSE42-NEXT: shlq $56, %r8
-; SSE42-NEXT: orq %r9, %r8
-; SSE42-NEXT: movq %r8, %xmm1
-; SSE42-NEXT: movq %rdx, %xmm0
+; SSE42-NEXT: orq %r11, %r9
+; SSE42-NEXT: shlq $56, %rdi
+; SSE42-NEXT: orq %r9, %rdi
+; SSE42-NEXT: shlq $32, %r8
+; SSE42-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; SSE42-NEXT: orq %r8, %rdx
+; SSE42-NEXT: shlq $40, %rsi
+; SSE42-NEXT: orq %rdx, %rsi
+; SSE42-NEXT: shlq $48, %rcx
+; SSE42-NEXT: orq %rsi, %rcx
+; SSE42-NEXT: shlq $56, %rax
+; SSE42-NEXT: orq %rcx, %rax
+; SSE42-NEXT: movq %rax, %xmm1
+; SSE42-NEXT: movq %rdi, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT: popq %rbx
; SSE42-NEXT: retq
@@ -800,52 +800,52 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %r9
; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
-; AVX-NEXT: movq %r9, %r8
-; AVX-NEXT: shrq $56, %r8
-; AVX-NEXT: andl $15, %r8d
-; AVX-NEXT: movq %r9, %r10
-; AVX-NEXT: shrq $48, %r10
-; AVX-NEXT: andl $15, %r10d
-; AVX-NEXT: movq %r9, %rsi
-; AVX-NEXT: shrq $40, %rsi
-; AVX-NEXT: andl $15, %esi
-; AVX-NEXT: movq %r9, %r11
-; AVX-NEXT: shrq $32, %r11
-; AVX-NEXT: andl $15, %r11d
-; AVX-NEXT: movq %rdx, %rdi
-; AVX-NEXT: shrq $56, %rdi
-; AVX-NEXT: andl $15, %edi
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $48, %rax
+; AVX-NEXT: shrq $56, %rax
; AVX-NEXT: andl $15, %eax
; AVX-NEXT: movq %rdx, %rcx
-; AVX-NEXT: shrq $40, %rcx
+; AVX-NEXT: shrq $48, %rcx
; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: movq %rdx, %rbx
+; AVX-NEXT: movq %rdx, %rsi
+; AVX-NEXT: shrq $40, %rsi
+; AVX-NEXT: andl $15, %esi
+; AVX-NEXT: movq %rdx, %r8
+; AVX-NEXT: shrq $32, %r8
+; AVX-NEXT: andl $15, %r8d
+; AVX-NEXT: movq %rdi, %r9
+; AVX-NEXT: shrq $56, %r9
+; AVX-NEXT: andl $15, %r9d
+; AVX-NEXT: movq %rdi, %r10
+; AVX-NEXT: shrq $48, %r10
+; AVX-NEXT: andl $15, %r10d
+; AVX-NEXT: movq %rdi, %r11
+; AVX-NEXT: shrq $40, %r11
+; AVX-NEXT: andl $15, %r11d
+; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: shrq $32, %rbx
; AVX-NEXT: andl $15, %ebx
; AVX-NEXT: shlq $32, %rbx
+; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
+; AVX-NEXT: orq %rbx, %rdi
+; AVX-NEXT: shlq $40, %r11
+; AVX-NEXT: orq %rdi, %r11
+; AVX-NEXT: shlq $48, %r10
+; AVX-NEXT: orq %r11, %r10
+; AVX-NEXT: shlq $56, %r9
+; AVX-NEXT: orq %r10, %r9
+; AVX-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shlq $32, %r8
; AVX-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
-; AVX-NEXT: orq %rbx, %rdx
-; AVX-NEXT: shlq $40, %rcx
-; AVX-NEXT: orq %rdx, %rcx
-; AVX-NEXT: shlq $48, %rax
-; AVX-NEXT: orq %rcx, %rax
-; AVX-NEXT: shlq $56, %rdi
-; AVX-NEXT: orq %rax, %rdi
-; AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: shlq $32, %r11
-; AVX-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F
-; AVX-NEXT: orq %r11, %r9
+; AVX-NEXT: orq %r8, %rdx
; AVX-NEXT: shlq $40, %rsi
-; AVX-NEXT: orq %r9, %rsi
-; AVX-NEXT: shlq $48, %r10
-; AVX-NEXT: orq %rsi, %r10
-; AVX-NEXT: shlq $56, %r8
-; AVX-NEXT: orq %r10, %r8
-; AVX-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: shlq $48, %rcx
+; AVX-NEXT: orq %rsi, %rcx
+; AVX-NEXT: shlq $56, %rax
+; AVX-NEXT: orq %rcx, %rax
+; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
@@ -875,52 +875,52 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %r10
-; SSE2-NEXT: movq %r10, %r8
-; SSE2-NEXT: shrq $56, %r8
+; SSE2-NEXT: movq %xmm2, %rdx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $56, %rax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movq %rdx, %rcx
+; SSE2-NEXT: shrq $48, %rcx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movq %rdx, %rsi
+; SSE2-NEXT: shrq $40, %rsi
+; SSE2-NEXT: andl $15, %esi
+; SSE2-NEXT: movq %rdx, %r8
+; SSE2-NEXT: shrq $32, %r8
; SSE2-NEXT: andl $15, %r8d
+; SSE2-NEXT: movq %xmm0, %r10
+; SSE2-NEXT: movq %r10, %rdi
+; SSE2-NEXT: shrq $56, %rdi
+; SSE2-NEXT: andl $15, %edi
; SSE2-NEXT: movq %r10, %r9
; SSE2-NEXT: shrq $48, %r9
; SSE2-NEXT: andl $15, %r9d
-; SSE2-NEXT: movq %r10, %rsi
-; SSE2-NEXT: shrq $40, %rsi
-; SSE2-NEXT: andl $15, %esi
; SSE2-NEXT: movq %r10, %r11
-; SSE2-NEXT: shrq $32, %r11
+; SSE2-NEXT: shrq $40, %r11
; SSE2-NEXT: andl $15, %r11d
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %rax, %rdx
-; SSE2-NEXT: shrq $56, %rdx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq $48, %rcx
-; SSE2-NEXT: andl $15, %ecx
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: shrq $40, %rdi
-; SSE2-NEXT: andl $15, %edi
-; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: movq %r10, %rbx
; SSE2-NEXT: shrq $32, %rbx
; SSE2-NEXT: andl $15, %ebx
; SSE2-NEXT: shlq $32, %rbx
-; SSE2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; SSE2-NEXT: orq %rbx, %rax
-; SSE2-NEXT: shlq $40, %rdi
-; SSE2-NEXT: orq %rax, %rdi
-; SSE2-NEXT: shlq $48, %rcx
-; SSE2-NEXT: orq %rdi, %rcx
-; SSE2-NEXT: shlq $56, %rdx
-; SSE2-NEXT: orq %rcx, %rdx
-; SSE2-NEXT: shlq $32, %r11
; SSE2-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F
-; SSE2-NEXT: orq %r11, %r10
-; SSE2-NEXT: shlq $40, %rsi
-; SSE2-NEXT: orq %r10, %rsi
+; SSE2-NEXT: orq %rbx, %r10
+; SSE2-NEXT: shlq $40, %r11
+; SSE2-NEXT: orq %r10, %r11
; SSE2-NEXT: shlq $48, %r9
-; SSE2-NEXT: orq %rsi, %r9
-; SSE2-NEXT: shlq $56, %r8
-; SSE2-NEXT: orq %r9, %r8
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movq %r8, %xmm2
+; SSE2-NEXT: orq %r11, %r9
+; SSE2-NEXT: shlq $56, %rdi
+; SSE2-NEXT: orq %r9, %rdi
+; SSE2-NEXT: shlq $32, %r8
+; SSE2-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; SSE2-NEXT: orq %r8, %rdx
+; SSE2-NEXT: shlq $40, %rsi
+; SSE2-NEXT: orq %rdx, %rsi
+; SSE2-NEXT: shlq $48, %rcx
+; SSE2-NEXT: orq %rsi, %rcx
+; SSE2-NEXT: shlq $56, %rax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: movq %rdi, %xmm0
+; SSE2-NEXT: movq %rax, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
@@ -928,52 +928,52 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; SSE42-LABEL: _clearupper32xi8b:
; SSE42: # %bb.0:
; SSE42-NEXT: pushq %rbx
-; SSE42-NEXT: pextrq $1, %xmm0, %r10
-; SSE42-NEXT: movq %r10, %r8
-; SSE42-NEXT: shrq $56, %r8
+; SSE42-NEXT: pextrq $1, %xmm0, %rdx
+; SSE42-NEXT: movq %rdx, %rax
+; SSE42-NEXT: shrq $56, %rax
+; SSE42-NEXT: andl $15, %eax
+; SSE42-NEXT: movq %rdx, %rcx
+; SSE42-NEXT: shrq $48, %rcx
+; SSE42-NEXT: andl $15, %ecx
+; SSE42-NEXT: movq %rdx, %rsi
+; SSE42-NEXT: shrq $40, %rsi
+; SSE42-NEXT: andl $15, %esi
+; SSE42-NEXT: movq %rdx, %r8
+; SSE42-NEXT: shrq $32, %r8
; SSE42-NEXT: andl $15, %r8d
+; SSE42-NEXT: movq %xmm0, %r10
+; SSE42-NEXT: movq %r10, %rdi
+; SSE42-NEXT: shrq $56, %rdi
+; SSE42-NEXT: andl $15, %edi
; SSE42-NEXT: movq %r10, %r9
; SSE42-NEXT: shrq $48, %r9
; SSE42-NEXT: andl $15, %r9d
-; SSE42-NEXT: movq %r10, %rsi
-; SSE42-NEXT: shrq $40, %rsi
-; SSE42-NEXT: andl $15, %esi
; SSE42-NEXT: movq %r10, %r11
-; SSE42-NEXT: shrq $32, %r11
+; SSE42-NEXT: shrq $40, %r11
; SSE42-NEXT: andl $15, %r11d
-; SSE42-NEXT: movq %xmm0, %rax
-; SSE42-NEXT: movq %rax, %rdx
-; SSE42-NEXT: shrq $56, %rdx
-; SSE42-NEXT: andl $15, %edx
-; SSE42-NEXT: movq %rax, %rcx
-; SSE42-NEXT: shrq $48, %rcx
-; SSE42-NEXT: andl $15, %ecx
-; SSE42-NEXT: movq %rax, %rdi
-; SSE42-NEXT: shrq $40, %rdi
-; SSE42-NEXT: andl $15, %edi
-; SSE42-NEXT: movq %rax, %rbx
+; SSE42-NEXT: movq %r10, %rbx
; SSE42-NEXT: shrq $32, %rbx
; SSE42-NEXT: andl $15, %ebx
; SSE42-NEXT: shlq $32, %rbx
-; SSE42-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; SSE42-NEXT: orq %rbx, %rax
-; SSE42-NEXT: shlq $40, %rdi
-; SSE42-NEXT: orq %rax, %rdi
-; SSE42-NEXT: shlq $48, %rcx
-; SSE42-NEXT: orq %rdi, %rcx
-; SSE42-NEXT: shlq $56, %rdx
-; SSE42-NEXT: orq %rcx, %rdx
-; SSE42-NEXT: shlq $32, %r11
; SSE42-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F
-; SSE42-NEXT: orq %r11, %r10
-; SSE42-NEXT: shlq $40, %rsi
-; SSE42-NEXT: orq %r10, %rsi
+; SSE42-NEXT: orq %rbx, %r10
+; SSE42-NEXT: shlq $40, %r11
+; SSE42-NEXT: orq %r10, %r11
; SSE42-NEXT: shlq $48, %r9
-; SSE42-NEXT: orq %rsi, %r9
-; SSE42-NEXT: shlq $56, %r8
-; SSE42-NEXT: orq %r9, %r8
-; SSE42-NEXT: movq %r8, %xmm2
-; SSE42-NEXT: movq %rdx, %xmm0
+; SSE42-NEXT: orq %r11, %r9
+; SSE42-NEXT: shlq $56, %rdi
+; SSE42-NEXT: orq %r9, %rdi
+; SSE42-NEXT: shlq $32, %r8
+; SSE42-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; SSE42-NEXT: orq %r8, %rdx
+; SSE42-NEXT: shlq $40, %rsi
+; SSE42-NEXT: orq %rdx, %rsi
+; SSE42-NEXT: shlq $48, %rcx
+; SSE42-NEXT: orq %rsi, %rcx
+; SSE42-NEXT: shlq $56, %rax
+; SSE42-NEXT: orq %rcx, %rax
+; SSE42-NEXT: movq %rax, %xmm2
+; SSE42-NEXT: movq %rdi, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE42-NEXT: popq %rbx
; SSE42-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index f0e90b11020f9..806816ec8ea31 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -476,11 +476,11 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; SSE: # %bb.0: # %start
; SSE-NEXT: movslq %esi, %rax
; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
-; SSE-NEXT: movdqa %xmm9, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB8_1: # %loop
@@ -489,44 +489,44 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; SSE-NEXT: pmovsxdq 2097168(%rdi,%rax), %xmm4
; SSE-NEXT: pmovsxdq 2097152(%rdi,%rax), %xmm6
; SSE-NEXT: pmovsxdq 2097160(%rdi,%rax), %xmm7
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pmuludq %xmm7, %xmm3
-; SSE-NEXT: movdqa %xmm9, %xmm2
-; SSE-NEXT: pmuludq %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm8
+; SSE-NEXT: pmuludq %xmm7, %xmm8
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pmuludq %xmm7, %xmm9
; SSE-NEXT: psrlq $32, %xmm7
-; SSE-NEXT: pmuludq %xmm9, %xmm7
-; SSE-NEXT: paddq %xmm3, %xmm7
+; SSE-NEXT: pmuludq %xmm2, %xmm7
+; SSE-NEXT: paddq %xmm8, %xmm7
; SSE-NEXT: psllq $32, %xmm7
-; SSE-NEXT: paddq %xmm2, %xmm7
-; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: pmuludq %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pmuludq %xmm6, %xmm3
+; SSE-NEXT: paddq %xmm9, %xmm7
+; SSE-NEXT: movdqa %xmm3, %xmm8
+; SSE-NEXT: pmuludq %xmm6, %xmm8
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pmuludq %xmm6, %xmm9
; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm9, %xmm6
-; SSE-NEXT: paddq %xmm2, %xmm6
+; SSE-NEXT: pmuludq %xmm2, %xmm6
+; SSE-NEXT: paddq %xmm8, %xmm6
; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: paddq %xmm3, %xmm6
+; SSE-NEXT: paddq %xmm9, %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm7[1,3]
; SSE-NEXT: paddd %xmm6, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm9, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm9, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm9, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pmuludq %xmm5, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm9, %xmm5
-; SSE-NEXT: paddq %xmm3, %xmm5
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm2, %xmm6
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: pmuludq %xmm4, %xmm7
+; SSE-NEXT: paddq %xmm6, %xmm7
+; SSE-NEXT: psllq $32, %xmm7
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: paddq %xmm7, %xmm4
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm2, %xmm6
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: pmuludq %xmm5, %xmm7
+; SSE-NEXT: paddq %xmm6, %xmm7
+; SSE-NEXT: psllq $32, %xmm7
+; SSE-NEXT: pmuludq %xmm2, %xmm5
+; SSE-NEXT: paddq %xmm7, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3]
; SSE-NEXT: paddd %xmm4, %xmm1
; SSE-NEXT: subq $-128, %rax
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 504cf244348ed..d9a6b6d9ff8a2 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -551,8 +551,8 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = <u,4,2,16,8,32,64,2>
-; SSE2-NEXT: pmulhuw %xmm8, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
+; SSE2-NEXT: pmulhuw %xmm7, %xmm0
; SSE2-NEXT: paddw %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -568,40 +568,40 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: pandn %xmm6, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,0]
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: pandn %xmm2, %xmm8
+; SSE2-NEXT: por %xmm0, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psraw $15, %xmm3
-; SSE2-NEXT: pmulhuw %xmm8, %xmm3
+; SSE2-NEXT: pmulhuw %xmm7, %xmm3
; SSE2-NEXT: paddw %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: psraw $4, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm4
-; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: psraw $2, %xmm4
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: psraw $1, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm7
-; SSE2-NEXT: por %xmm3, %xmm7
-; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
@@ -709,114 +709,114 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2>
; SSE2-NEXT: pmulhuw %xmm9, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: pandn %xmm0, %xmm8
+; SSE2-NEXT: por %xmm4, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
-; SSE2-NEXT: movdqa %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: psraw $2, %xmm5
+; SSE2-NEXT: psraw $2, %xmm8
; SSE2-NEXT: movdqa %xmm7, %xmm4
-; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,0,65535,0]
; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: psraw $1, %xmm4
-; SSE2-NEXT: movdqa %xmm10, %xmm5
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm12, %xmm5
-; SSE2-NEXT: movdqa %xmm12, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: pandn %xmm4, %xmm10
+; SSE2-NEXT: por %xmm0, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm4, %xmm10
+; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: por %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: pmulhuw %xmm9, %xmm1
-; SSE2-NEXT: paddw %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: paddw %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
; SSE2-NEXT: psraw $4, %xmm1
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm11
+; SSE2-NEXT: pandn %xmm1, %xmm11
+; SSE2-NEXT: por %xmm10, %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm1
; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: psraw $2, %xmm6
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm10, %xmm1
-; SSE2-NEXT: psraw $1, %xmm5
-; SSE2-NEXT: movdqa %xmm10, %xmm6
-; SSE2-NEXT: pandn %xmm5, %xmm6
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm12, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: psraw $2, %xmm11
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pandn %xmm11, %xmm10
+; SSE2-NEXT: por %xmm1, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: psraw $1, %xmm10
+; SSE2-NEXT: movdqa %xmm8, %xmm11
+; SSE2-NEXT: pandn %xmm10, %xmm11
+; SSE2-NEXT: por %xmm1, %xmm11
+; SSE2-NEXT: pand %xmm4, %xmm11
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm1
+; SSE2-NEXT: por %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: pmulhuw %xmm9, %xmm5
; SSE2-NEXT: paddw %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
; SSE2-NEXT: psraw $4, %xmm5
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm6, %xmm11
+; SSE2-NEXT: pandn %xmm5, %xmm11
+; SSE2-NEXT: por %xmm10, %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm5
-; SSE2-NEXT: psraw $2, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm6
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: psraw $1, %xmm6
+; SSE2-NEXT: psraw $2, %xmm11
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pandn %xmm11, %xmm10
+; SSE2-NEXT: por %xmm5, %xmm10
; SSE2-NEXT: movdqa %xmm10, %xmm5
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm12, %xmm5
-; SSE2-NEXT: movdqa %xmm12, %xmm8
-; SSE2-NEXT: pandn %xmm2, %xmm8
-; SSE2-NEXT: por %xmm5, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: psraw $1, %xmm10
+; SSE2-NEXT: movdqa %xmm8, %xmm11
+; SSE2-NEXT: pandn %xmm10, %xmm11
+; SSE2-NEXT: por %xmm5, %xmm11
+; SSE2-NEXT: pand %xmm4, %xmm11
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: por %xmm11, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: pmulhuw %xmm9, %xmm2
; SSE2-NEXT: paddw %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: pand %xmm6, %xmm9
; SSE2-NEXT: psraw $4, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm11
-; SSE2-NEXT: por %xmm4, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm9, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm2
; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: psraw $2, %xmm11
-; SSE2-NEXT: pandn %xmm11, %xmm7
+; SSE2-NEXT: psraw $2, %xmm6
+; SSE2-NEXT: pandn %xmm6, %xmm7
; SSE2-NEXT: por %xmm2, %xmm7
; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: psraw $1, %xmm7
-; SSE2-NEXT: pandn %xmm7, %xmm10
-; SSE2-NEXT: por %xmm2, %xmm10
-; SSE2-NEXT: pand %xmm12, %xmm10
-; SSE2-NEXT: pandn %xmm3, %xmm12
-; SSE2-NEXT: por %xmm10, %xmm12
-; SSE2-NEXT: movdqa %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm12, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm2, %xmm8
+; SSE2-NEXT: pand %xmm4, %xmm8
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
diff --git a/llvm/test/CodeGen/X86/commute-fcmp.ll b/llvm/test/CodeGen/X86/commute-fcmp.ll
index 641a4740ede8e..c9b8379d81d02 100644
--- a/llvm/test/CodeGen/X86/commute-fcmp.ll
+++ b/llvm/test/CodeGen/X86/commute-fcmp.ll
@@ -896,14 +896,14 @@ define <16 x i32> @commute_cmpps_uno_zmm(ptr %a0, <16 x float> %a1) {
define <16 x i32> @commute_cmpps_ueq_zmm(ptr %a0, <16 x float> %a1) {
; SSE-LABEL: commute_cmpps_ueq_zmm:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm7
+; SSE-NEXT: movaps (%rdi), %xmm4
; SSE-NEXT: movaps 16(%rdi), %xmm5
; SSE-NEXT: movaps 32(%rdi), %xmm6
-; SSE-NEXT: movaps 48(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm7, %xmm4
-; SSE-NEXT: cmpeqps %xmm0, %xmm4
-; SSE-NEXT: cmpunordps %xmm7, %xmm0
-; SSE-NEXT: orps %xmm4, %xmm0
+; SSE-NEXT: movaps 48(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm4, %xmm8
+; SSE-NEXT: cmpeqps %xmm0, %xmm8
+; SSE-NEXT: cmpunordps %xmm4, %xmm0
+; SSE-NEXT: orps %xmm8, %xmm0
; SSE-NEXT: movaps %xmm5, %xmm4
; SSE-NEXT: cmpeqps %xmm1, %xmm4
; SSE-NEXT: cmpunordps %xmm5, %xmm1
@@ -912,9 +912,9 @@ define <16 x i32> @commute_cmpps_ueq_zmm(ptr %a0, <16 x float> %a1) {
; SSE-NEXT: cmpeqps %xmm2, %xmm4
; SSE-NEXT: cmpunordps %xmm6, %xmm2
; SSE-NEXT: orps %xmm4, %xmm2
-; SSE-NEXT: movaps %xmm8, %xmm4
+; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: cmpeqps %xmm3, %xmm4
-; SSE-NEXT: cmpunordps %xmm8, %xmm3
+; SSE-NEXT: cmpunordps %xmm7, %xmm3
; SSE-NEXT: orps %xmm4, %xmm3
; SSE-NEXT: retq
;
@@ -938,14 +938,14 @@ define <16 x i32> @commute_cmpps_ueq_zmm(ptr %a0, <16 x float> %a1) {
define <16 x i32> @commute_cmpps_one_zmm(ptr %a0, <16 x float> %a1) {
; SSE-LABEL: commute_cmpps_one_zmm:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm7
+; SSE-NEXT: movaps (%rdi), %xmm4
; SSE-NEXT: movaps 16(%rdi), %xmm5
; SSE-NEXT: movaps 32(%rdi), %xmm6
-; SSE-NEXT: movaps 48(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm7, %xmm4
-; SSE-NEXT: cmpneqps %xmm0, %xmm4
-; SSE-NEXT: cmpordps %xmm7, %xmm0
-; SSE-NEXT: andps %xmm4, %xmm0
+; SSE-NEXT: movaps 48(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm4, %xmm8
+; SSE-NEXT: cmpneqps %xmm0, %xmm8
+; SSE-NEXT: cmpordps %xmm4, %xmm0
+; SSE-NEXT: andps %xmm8, %xmm0
; SSE-NEXT: movaps %xmm5, %xmm4
; SSE-NEXT: cmpneqps %xmm1, %xmm4
; SSE-NEXT: cmpordps %xmm5, %xmm1
@@ -954,9 +954,9 @@ define <16 x i32> @commute_cmpps_one_zmm(ptr %a0, <16 x float> %a1) {
; SSE-NEXT: cmpneqps %xmm2, %xmm4
; SSE-NEXT: cmpordps %xmm6, %xmm2
; SSE-NEXT: andps %xmm4, %xmm2
-; SSE-NEXT: movaps %xmm8, %xmm4
+; SSE-NEXT: movaps %xmm7, %xmm4
; SSE-NEXT: cmpneqps %xmm3, %xmm4
-; SSE-NEXT: cmpordps %xmm8, %xmm3
+; SSE-NEXT: cmpordps %xmm7, %xmm3
; SSE-NEXT: andps %xmm4, %xmm3
; SSE-NEXT: retq
;
@@ -1156,14 +1156,14 @@ define <8 x i64> @commute_cmppd_uno_zmmm(ptr %a0, <8 x double> %a1) {
define <8 x i64> @commute_cmppd_ueq_zmmm(ptr %a0, <8 x double> %a1) {
; SSE-LABEL: commute_cmppd_ueq_zmmm:
; SSE: # %bb.0:
-; SSE-NEXT: movapd (%rdi), %xmm7
+; SSE-NEXT: movapd (%rdi), %xmm4
; SSE-NEXT: movapd 16(%rdi), %xmm5
; SSE-NEXT: movapd 32(%rdi), %xmm6
-; SSE-NEXT: movapd 48(%rdi), %xmm8
-; SSE-NEXT: movapd %xmm7, %xmm4
-; SSE-NEXT: cmpeqpd %xmm0, %xmm4
-; SSE-NEXT: cmpunordpd %xmm7, %xmm0
-; SSE-NEXT: orpd %xmm4, %xmm0
+; SSE-NEXT: movapd 48(%rdi), %xmm7
+; SSE-NEXT: movapd %xmm4, %xmm8
+; SSE-NEXT: cmpeqpd %xmm0, %xmm8
+; SSE-NEXT: cmpunordpd %xmm4, %xmm0
+; SSE-NEXT: orpd %xmm8, %xmm0
; SSE-NEXT: movapd %xmm5, %xmm4
; SSE-NEXT: cmpeqpd %xmm1, %xmm4
; SSE-NEXT: cmpunordpd %xmm5, %xmm1
@@ -1172,9 +1172,9 @@ define <8 x i64> @commute_cmppd_ueq_zmmm(ptr %a0, <8 x double> %a1) {
; SSE-NEXT: cmpeqpd %xmm2, %xmm4
; SSE-NEXT: cmpunordpd %xmm6, %xmm2
; SSE-NEXT: orpd %xmm4, %xmm2
-; SSE-NEXT: movapd %xmm8, %xmm4
+; SSE-NEXT: movapd %xmm7, %xmm4
; SSE-NEXT: cmpeqpd %xmm3, %xmm4
-; SSE-NEXT: cmpunordpd %xmm8, %xmm3
+; SSE-NEXT: cmpunordpd %xmm7, %xmm3
; SSE-NEXT: orpd %xmm4, %xmm3
; SSE-NEXT: retq
;
@@ -1198,14 +1198,14 @@ define <8 x i64> @commute_cmppd_ueq_zmmm(ptr %a0, <8 x double> %a1) {
define <8 x i64> @commute_cmppd_one_zmmm(ptr %a0, <8 x double> %a1) {
; SSE-LABEL: commute_cmppd_one_zmmm:
; SSE: # %bb.0:
-; SSE-NEXT: movapd (%rdi), %xmm7
+; SSE-NEXT: movapd (%rdi), %xmm4
; SSE-NEXT: movapd 16(%rdi), %xmm5
; SSE-NEXT: movapd 32(%rdi), %xmm6
-; SSE-NEXT: movapd 48(%rdi), %xmm8
-; SSE-NEXT: movapd %xmm7, %xmm4
-; SSE-NEXT: cmpneqpd %xmm0, %xmm4
-; SSE-NEXT: cmpordpd %xmm7, %xmm0
-; SSE-NEXT: andpd %xmm4, %xmm0
+; SSE-NEXT: movapd 48(%rdi), %xmm7
+; SSE-NEXT: movapd %xmm4, %xmm8
+; SSE-NEXT: cmpneqpd %xmm0, %xmm8
+; SSE-NEXT: cmpordpd %xmm4, %xmm0
+; SSE-NEXT: andpd %xmm8, %xmm0
; SSE-NEXT: movapd %xmm5, %xmm4
; SSE-NEXT: cmpneqpd %xmm1, %xmm4
; SSE-NEXT: cmpordpd %xmm5, %xmm1
@@ -1214,9 +1214,9 @@ define <8 x i64> @commute_cmppd_one_zmmm(ptr %a0, <8 x double> %a1) {
; SSE-NEXT: cmpneqpd %xmm2, %xmm4
; SSE-NEXT: cmpordpd %xmm6, %xmm2
; SSE-NEXT: andpd %xmm4, %xmm2
-; SSE-NEXT: movapd %xmm8, %xmm4
+; SSE-NEXT: movapd %xmm7, %xmm4
; SSE-NEXT: cmpneqpd %xmm3, %xmm4
-; SSE-NEXT: cmpordpd %xmm8, %xmm3
+; SSE-NEXT: cmpordpd %xmm7, %xmm3
; SSE-NEXT: andpd %xmm4, %xmm3
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/compact-unwind.ll b/llvm/test/CodeGen/X86/compact-unwind.ll
index dfd411d91d7b7..b9860fb855388 100644
--- a/llvm/test/CodeGen/X86/compact-unwind.ll
+++ b/llvm/test/CodeGen/X86/compact-unwind.ll
@@ -65,12 +65,12 @@ declare void @OSMemoryBarrier() optsize
; NOFP-CU: Entry at offset 0x20:
; NOFP-CU-NEXT: start: 0x1d _test1
-; NOFP-CU-NEXT: length: 0x42
+; NOFP-CU-NEXT: length: 0x44
; NOFP-CU-NEXT: compact encoding: 0x02040c0a
; NOFP-FROM-ASM: Entry at offset 0x20:
; NOFP-FROM-ASM-NEXT: start: 0x1d _test1
-; NOFP-FROM-ASM-NEXT: length: 0x42
+; NOFP-FROM-ASM-NEXT: length: 0x44
; NOFP-FROM-ASM-NEXT: compact encoding: 0x02040c0a
define void @test1(ptr %image) optsize ssp uwtable {
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index d899c20183dfb..d1ef1ab390396 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -362,14 +362,14 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; CHECK64-NEXT: movq (%rdi), %rdi # encoding: [0x48,0x8b,0x3f]
; CHECK64-NEXT: movq -24(%rdi), %rax # encoding: [0x48,0x8b,0x47,0xe8]
; CHECK64-NEXT: leaq (%rdi,%rax), %rsi # encoding: [0x48,0x8d,0x34,0x07]
-; CHECK64-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; CHECK64-NEXT: xorl %r8d, %r8d # encoding: [0x45,0x31,0xc0]
; CHECK64-NEXT: pushq $2 # encoding: [0x6a,0x02]
; CHECK64-NEXT: .cfi_adjust_cfa_offset 8
-; CHECK64-NEXT: popq %r9 # encoding: [0x41,0x59]
+; CHECK64-NEXT: popq %rcx # encoding: [0x59]
; CHECK64-NEXT: .cfi_adjust_cfa_offset -8
; CHECK64-NEXT: pushq $1 # encoding: [0x6a,0x01]
; CHECK64-NEXT: .cfi_adjust_cfa_offset 8
-; CHECK64-NEXT: popq %r8 # encoding: [0x41,0x58]
+; CHECK64-NEXT: popq %rdx # encoding: [0x5a]
; CHECK64-NEXT: .cfi_adjust_cfa_offset -8
; CHECK64-NEXT: .LBB3_1: # %for.cond
; CHECK64-NEXT: # =>This Inner Loop Header: Depth=1
@@ -378,56 +378,56 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_12-1, kind: FK_PCRel_1
; CHECK64-NEXT: # %bb.2: # %for.body
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: cmpl $2, %ecx # encoding: [0x83,0xf9,0x02]
+; CHECK64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02]
; CHECK64-NEXT: je .LBB3_10 # encoding: [0x74,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
; CHECK64-NEXT: # %bb.3: # %for.body
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: cmpl $1, %ecx # encoding: [0x83,0xf9,0x01]
+; CHECK64-NEXT: cmpl $1, %r8d # encoding: [0x41,0x83,0xf8,0x01]
; CHECK64-NEXT: je .LBB3_8 # encoding: [0x74,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1
; CHECK64-NEXT: # %bb.4: # %for.body
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: testl %ecx, %ecx # encoding: [0x85,0xc9]
+; CHECK64-NEXT: testl %r8d, %r8d # encoding: [0x45,0x85,0xc0]
; CHECK64-NEXT: jne .LBB3_11 # encoding: [0x75,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
; CHECK64-NEXT: # %bb.5: # %sw.bb
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: movzbl (%rdi), %edx # encoding: [0x0f,0xb6,0x17]
-; CHECK64-NEXT: cmpl $43, %edx # encoding: [0x83,0xfa,0x2b]
-; CHECK64-NEXT: movl %r8d, %ecx # encoding: [0x44,0x89,0xc1]
+; CHECK64-NEXT: movzbl (%rdi), %r9d # encoding: [0x44,0x0f,0xb6,0x0f]
+; CHECK64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b]
+; CHECK64-NEXT: movl %edx, %r8d # encoding: [0x41,0x89,0xd0]
; CHECK64-NEXT: je .LBB3_11 # encoding: [0x74,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
; CHECK64-NEXT: # %bb.6: # %sw.bb
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: cmpl $45, %edx # encoding: [0x83,0xfa,0x2d]
-; CHECK64-NEXT: movl %r8d, %ecx # encoding: [0x44,0x89,0xc1]
+; CHECK64-NEXT: cmpl $45, %r9d # encoding: [0x41,0x83,0xf9,0x2d]
+; CHECK64-NEXT: movl %edx, %r8d # encoding: [0x41,0x89,0xd0]
; CHECK64-NEXT: je .LBB3_11 # encoding: [0x74,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
; CHECK64-NEXT: # %bb.7: # %if.else
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: addl $-48, %edx # encoding: [0x83,0xc2,0xd0]
-; CHECK64-NEXT: cmpl $10, %edx # encoding: [0x83,0xfa,0x0a]
+; CHECK64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0]
+; CHECK64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a]
; CHECK64-NEXT: jmp .LBB3_9 # encoding: [0xeb,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
; CHECK64-NEXT: .LBB3_8: # %sw.bb14
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: movzbl (%rdi), %ecx # encoding: [0x0f,0xb6,0x0f]
-; CHECK64-NEXT: addl $-48, %ecx # encoding: [0x83,0xc1,0xd0]
-; CHECK64-NEXT: cmpl $10, %ecx # encoding: [0x83,0xf9,0x0a]
+; CHECK64-NEXT: movzbl (%rdi), %r8d # encoding: [0x44,0x0f,0xb6,0x07]
+; CHECK64-NEXT: addl $-48, %r8d # encoding: [0x41,0x83,0xc0,0xd0]
+; CHECK64-NEXT: cmpl $10, %r8d # encoding: [0x41,0x83,0xf8,0x0a]
; CHECK64-NEXT: .LBB3_9: # %if.else
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: movl %r9d, %ecx # encoding: [0x44,0x89,0xc9]
+; CHECK64-NEXT: movl %ecx, %r8d # encoding: [0x41,0x89,0xc8]
; CHECK64-NEXT: jb .LBB3_11 # encoding: [0x72,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
; CHECK64-NEXT: jmp .LBB3_13 # encoding: [0xeb,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_13-1, kind: FK_PCRel_1
; CHECK64-NEXT: .LBB3_10: # %sw.bb22
; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; CHECK64-NEXT: movzbl (%rdi), %ecx # encoding: [0x0f,0xb6,0x0f]
-; CHECK64-NEXT: addl $-48, %ecx # encoding: [0x83,0xc1,0xd0]
-; CHECK64-NEXT: cmpl $10, %ecx # encoding: [0x83,0xf9,0x0a]
-; CHECK64-NEXT: movl %r9d, %ecx # encoding: [0x44,0x89,0xc9]
+; CHECK64-NEXT: movzbl (%rdi), %r8d # encoding: [0x44,0x0f,0xb6,0x07]
+; CHECK64-NEXT: addl $-48, %r8d # encoding: [0x41,0x83,0xc0,0xd0]
+; CHECK64-NEXT: cmpl $10, %r8d # encoding: [0x41,0x83,0xf8,0x0a]
+; CHECK64-NEXT: movl %ecx, %r8d # encoding: [0x41,0x89,0xc8]
; CHECK64-NEXT: jae _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_ # TAILCALL
; CHECK64-NEXT: # encoding: [0x73,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_-1, kind: FK_PCRel_1
@@ -438,7 +438,7 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; CHECK64-NEXT: jmp .LBB3_1 # encoding: [0xeb,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1
; CHECK64-NEXT: .LBB3_12:
-; CHECK64-NEXT: cmpl $2, %ecx # encoding: [0x83,0xf9,0x02]
+; CHECK64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02]
; CHECK64-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; CHECK64-NEXT: # kill: def $al killed $al killed $eax
; CHECK64-NEXT: retq # encoding: [0xc3]
@@ -450,34 +450,34 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; WIN64-LABEL: pr31257:
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: movq (%rcx), %rcx # encoding: [0x48,0x8b,0x09]
-; WIN64-NEXT: movq -24(%rcx), %r8 # encoding: [0x4c,0x8b,0x41,0xe8]
-; WIN64-NEXT: leaq (%rcx,%r8), %rdx # encoding: [0x4a,0x8d,0x14,0x01]
-; WIN64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; WIN64-NEXT: movq -24(%rcx), %rax # encoding: [0x48,0x8b,0x41,0xe8]
+; WIN64-NEXT: leaq (%rcx,%rax), %rdx # encoding: [0x48,0x8d,0x14,0x01]
+; WIN64-NEXT: xorl %r8d, %r8d # encoding: [0x45,0x31,0xc0]
; WIN64-NEXT: .LBB3_1: # %for.cond
; WIN64-NEXT: # =>This Inner Loop Header: Depth=1
-; WIN64-NEXT: testq %r8, %r8 # encoding: [0x4d,0x85,0xc0]
+; WIN64-NEXT: testq %rax, %rax # encoding: [0x48,0x85,0xc0]
; WIN64-NEXT: je .LBB3_11 # encoding: [0x74,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
; WIN64-NEXT: # %bb.2: # %for.body
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; WIN64-NEXT: cmpl $2, %eax # encoding: [0x83,0xf8,0x02]
+; WIN64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02]
; WIN64-NEXT: je .LBB3_9 # encoding: [0x74,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
; WIN64-NEXT: # %bb.3: # %for.body
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; WIN64-NEXT: cmpl $1, %eax # encoding: [0x83,0xf8,0x01]
+; WIN64-NEXT: cmpl $1, %r8d # encoding: [0x41,0x83,0xf8,0x01]
; WIN64-NEXT: je .LBB3_7 # encoding: [0x74,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_7-1, kind: FK_PCRel_1
; WIN64-NEXT: # %bb.4: # %for.body
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; WIN64-NEXT: testl %eax, %eax # encoding: [0x85,0xc0]
+; WIN64-NEXT: testl %r8d, %r8d # encoding: [0x45,0x85,0xc0]
; WIN64-NEXT: jne .LBB3_10 # encoding: [0x75,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
; WIN64-NEXT: # %bb.5: # %sw.bb
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]
; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b]
-; WIN64-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; WIN64-NEXT: movl $1, %r8d # encoding: [0x41,0xb8,0x01,0x00,0x00,0x00]
; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
; WIN64-NEXT: # %bb.6: # %sw.bb
@@ -493,7 +493,7 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; WIN64-NEXT: .LBB3_8: # %if.else
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
; WIN64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0]
-; WIN64-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
+; WIN64-NEXT: movl $2, %r8d # encoding: [0x41,0xb8,0x02,0x00,0x00,0x00]
; WIN64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a]
; WIN64-NEXT: jb .LBB3_10 # encoding: [0x72,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
@@ -503,7 +503,7 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]
; WIN64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0]
-; WIN64-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
+; WIN64-NEXT: movl $2, %r8d # encoding: [0x41,0xb8,0x02,0x00,0x00,0x00]
; WIN64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a]
; WIN64-NEXT: jae _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_ # TAILCALL
; WIN64-NEXT: # encoding: [0x73,A]
@@ -511,11 +511,11 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; WIN64-NEXT: .LBB3_10: # %for.inc
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
; WIN64-NEXT: incq %rcx # encoding: [0x48,0xff,0xc1]
-; WIN64-NEXT: decq %r8 # encoding: [0x49,0xff,0xc8]
+; WIN64-NEXT: decq %rax # encoding: [0x48,0xff,0xc8]
; WIN64-NEXT: jmp .LBB3_1 # encoding: [0xeb,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1
; WIN64-NEXT: .LBB3_11:
-; WIN64-NEXT: cmpl $2, %eax # encoding: [0x83,0xf8,0x02]
+; WIN64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02]
; WIN64-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; WIN64-NEXT: # kill: def $al killed $al killed $eax
; WIN64-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll
index cf72a34831371..15833fa3f0c08 100644
--- a/llvm/test/CodeGen/X86/copy-eflags.ll
+++ b/llvm/test/CodeGen/X86/copy-eflags.ll
@@ -247,25 +247,25 @@ define dso_local void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, ptr %pt
;
; X64-LABEL: PR37100:
; X64: # %bb.0: # %bb
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d
-; X64-NEXT: movzbl %cl, %r11d
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %esi
+; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_1: # %bb1
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movsbq %dil, %rax
-; X64-NEXT: xorl %ecx, %ecx
-; X64-NEXT: cmpq %rax, %rsi
-; X64-NEXT: setl %cl
-; X64-NEXT: negl %ecx
-; X64-NEXT: cmpq %rax, %rsi
+; X64-NEXT: xorl %r11d, %r11d
+; X64-NEXT: cmpq %rax, %r10
+; X64-NEXT: setl %r11b
+; X64-NEXT: negl %r11d
+; X64-NEXT: cmpq %rax, %r10
; X64-NEXT: movzbl %al, %edi
-; X64-NEXT: cmovgel %r11d, %edi
+; X64-NEXT: cmovgel %ecx, %edi
; X64-NEXT: movb %dil, (%r8)
-; X64-NEXT: cmovgel (%r9), %ecx
-; X64-NEXT: movl %r10d, %eax
+; X64-NEXT: cmovgel (%r9), %r11d
+; X64-NEXT: movl %esi, %eax
; X64-NEXT: cltd
-; X64-NEXT: idivl %ecx
+; X64-NEXT: idivl %r11d
; X64-NEXT: jmp .LBB3_1
bb:
br label %bb1
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index 123195e626997..a33319e66d5f1 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -162,27 +162,27 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
define i1 @ctpop_trunc_non_power2(i255 %x) nounwind {
; CHECK-LABEL: ctpop_trunc_non_power2:
; CHECK: # %bb.0:
-; CHECK-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rcx, %r9
-; CHECK-NEXT: andq %r8, %r9
-; CHECK-NEXT: movq %rdi, %r10
-; CHECK-NEXT: addq $-1, %r10
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: adcq $-1, %rax
+; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rcx, %r8
+; CHECK-NEXT: andq %rax, %r8
+; CHECK-NEXT: movq %rdi, %r9
+; CHECK-NEXT: addq $-1, %r9
+; CHECK-NEXT: movq %rsi, %r10
+; CHECK-NEXT: adcq $-1, %r10
; CHECK-NEXT: movq %rdx, %r11
; CHECK-NEXT: adcq $-1, %r11
-; CHECK-NEXT: adcq %r8, %rcx
-; CHECK-NEXT: andq %rdi, %r10
+; CHECK-NEXT: adcq %rax, %rcx
+; CHECK-NEXT: andq %rdi, %r9
; CHECK-NEXT: andq %rdx, %r11
-; CHECK-NEXT: orq %r10, %r11
-; CHECK-NEXT: andq %r9, %rcx
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: orq %rcx, %rax
-; CHECK-NEXT: orq %r11, %rax
+; CHECK-NEXT: orq %r9, %r11
+; CHECK-NEXT: andq %r8, %rcx
+; CHECK-NEXT: andq %rsi, %r10
+; CHECK-NEXT: orq %rcx, %r10
+; CHECK-NEXT: orq %r11, %r10
; CHECK-NEXT: sete %cl
; CHECK-NEXT: orq %rdx, %rdi
-; CHECK-NEXT: orq %rsi, %r9
-; CHECK-NEXT: orq %rdi, %r9
+; CHECK-NEXT: orq %rsi, %r8
+; CHECK-NEXT: orq %rdi, %r8
; CHECK-NEXT: setne %al
; CHECK-NEXT: andb %cl, %al
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index 443d89aaeaed8..54efe7c5160d4 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -24,38 +24,38 @@ define void @_Z1nv() local_unnamed_addr {
; CHECK-LABEL: _Z1nv:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq k@GOTPCREL(%rip), %rax
-; CHECK-NEXT: movl 4(%rax), %r11d
+; CHECK-NEXT: movl 4(%rax), %edx
; CHECK-NEXT: movq c@GOTPCREL(%rip), %rax
-; CHECK-NEXT: movswl (%rax), %r10d
-; CHECK-NEXT: movq b@GOTPCREL(%rip), %r8
-; CHECK-NEXT: movswl (%r8), %r9d
+; CHECK-NEXT: movswl (%rax), %ecx
+; CHECK-NEXT: movq b@GOTPCREL(%rip), %rax
+; CHECK-NEXT: movswl (%rax), %edi
; CHECK-NEXT: movq a@GOTPCREL(%rip), %rsi
; CHECK-NEXT: movl (%rsi), %esi
-; CHECK-NEXT: movq l@GOTPCREL(%rip), %rax
-; CHECK-NEXT: movl (%rax), %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shll $7, %eax
-; CHECK-NEXT: sarl $7, %eax
-; CHECK-NEXT: negl %eax
+; CHECK-NEXT: movq l@GOTPCREL(%rip), %r8
+; CHECK-NEXT: movl (%r8), %r8d
+; CHECK-NEXT: movl %r8d, %r9d
+; CHECK-NEXT: shll $7, %r9d
+; CHECK-NEXT: sarl $7, %r9d
+; CHECK-NEXT: negl %r9d
; CHECK-NEXT: testl %esi, %esi
-; CHECK-NEXT: cmovel %esi, %eax
-; CHECK-NEXT: movzwl %r11w, %ecx
-; CHECK-NEXT: leal (%r10,%rcx,2), %ecx
-; CHECK-NEXT: addl %r9d, %ecx
-; CHECK-NEXT: cmpl %eax, %ecx
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: testl $33554431, %edi # imm = 0x1FFFFFF
-; CHECK-NEXT: sete %dl
-; CHECK-NEXT: orb %al, %dl
-; CHECK-NEXT: movzbl %dl, %eax
-; CHECK-NEXT: movq e@GOTPCREL(%rip), %rdx
-; CHECK-NEXT: movw %ax, (%rdx)
+; CHECK-NEXT: cmovel %esi, %r9d
+; CHECK-NEXT: movzwl %dx, %r10d
+; CHECK-NEXT: leal (%rcx,%r10,2), %ecx
+; CHECK-NEXT: addl %edi, %ecx
+; CHECK-NEXT: cmpl %r9d, %ecx
+; CHECK-NEXT: sete %dil
+; CHECK-NEXT: testl $33554431, %r8d # imm = 0x1FFFFFF
+; CHECK-NEXT: sete %r8b
+; CHECK-NEXT: orb %dil, %r8b
+; CHECK-NEXT: movzbl %r8b, %edi
+; CHECK-NEXT: movq e@GOTPCREL(%rip), %r8
+; CHECK-NEXT: movw %di, (%r8)
; CHECK-NEXT: notl %ecx
; CHECK-NEXT: shrl $31, %ecx
-; CHECK-NEXT: addl %r11d, %ecx
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: sarl %cl, %esi
-; CHECK-NEXT: movw %si, (%r8)
+; CHECK-NEXT: movw %si, (%rax)
; CHECK-NEXT: retq
entry:
%bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.m, %struct.m* @k, i64 0, i32 0, i32 1) to i32*), align 4
@@ -115,115 +115,115 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax
-; CHECK-NEXT: movl (%rax), %edx
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: andl $511, %eax # imm = 0x1FF
-; CHECK-NEXT: leaq 1(%rax), %rsi
+; CHECK-NEXT: movl (%rax), %ebx
+; CHECK-NEXT: movl %ebx, %r9d
+; CHECK-NEXT: andl $511, %r9d # imm = 0x1FF
+; CHECK-NEXT: leaq 1(%r9), %rax
; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: movl %esi, (%rcx)
+; CHECK-NEXT: movl %eax, (%rcx)
; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx
; CHECK-NEXT: movl (%rcx), %ecx
; CHECK-NEXT: testl %ecx, %ecx
; CHECK-NEXT: je .LBB1_18
; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph
-; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdi
-; CHECK-NEXT: movq (%rdi), %r12
-; CHECK-NEXT: movl %ecx, %edi
-; CHECK-NEXT: notl %edi
-; CHECK-NEXT: leaq 8(,%rdi,8), %r14
-; CHECK-NEXT: imulq %rsi, %r14
-; CHECK-NEXT: addq %r12, %r14
-; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r15
-; CHECK-NEXT: movl (%r15), %ebx
-; CHECK-NEXT: leal 8(,%rax,8), %eax
+; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdx
+; CHECK-NEXT: movq (%rdx), %rsi
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: notl %edx
+; CHECK-NEXT: leaq 8(,%rdx,8), %rdi
+; CHECK-NEXT: imulq %rax, %rdi
+; CHECK-NEXT: addq %rsi, %rdi
+; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r8
+; CHECK-NEXT: movl (%r8), %edx
+; CHECK-NEXT: leal 8(,%r9,8), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: leaq 8(%r12), %rax
+; CHECK-NEXT: leaq 8(%rsi), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: leaq 32(%r12), %rax
-; CHECK-NEXT: andl $511, %edx # imm = 0x1FF
-; CHECK-NEXT: leaq 8(,%rdx,8), %r13
-; CHECK-NEXT: xorl %edi, %edi
-; CHECK-NEXT: movq x0@GOTPCREL(%rip), %rdx
-; CHECK-NEXT: movq %r12, %rsi
+; CHECK-NEXT: leaq 32(%rsi), %r11
+; CHECK-NEXT: andl $511, %ebx # imm = 0x1FF
+; CHECK-NEXT: leaq 8(,%rbx,8), %rbx
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r15
+; CHECK-NEXT: movq %rsi, %r12
; CHECK-NEXT: jmp .LBB1_2
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_15: # %for.cond1.for.inc3_crit_edge
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movl %ebx, (%r15)
+; CHECK-NEXT: movl %edx, (%r8)
; CHECK-NEXT: .LBB1_16: # %for.inc3
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq %r13, %rsi
-; CHECK-NEXT: incq %rdi
-; CHECK-NEXT: addq %r13, %rax
+; CHECK-NEXT: addq %rbx, %r12
+; CHECK-NEXT: incq %r14
+; CHECK-NEXT: addq %rbx, %r11
; CHECK-NEXT: incl %ecx
; CHECK-NEXT: je .LBB1_17
; CHECK-NEXT: .LBB1_2: # %for.cond1thread-pre-split
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB1_12 Depth 2
; CHECK-NEXT: # Child Loop BB1_14 Depth 2
-; CHECK-NEXT: testl %ebx, %ebx
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jns .LBB1_16
; CHECK-NEXT: # %bb.3: # %for.body2.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movslq %ebx, %r9
-; CHECK-NEXT: testq %r9, %r9
+; CHECK-NEXT: movslq %edx, %r13
+; CHECK-NEXT: testq %r13, %r13
; CHECK-NEXT: movq $-1, %rbp
-; CHECK-NEXT: cmovnsq %r9, %rbp
-; CHECK-NEXT: subq %r9, %rbp
+; CHECK-NEXT: cmovnsq %r13, %rbp
+; CHECK-NEXT: subq %r13, %rbp
; CHECK-NEXT: incq %rbp
; CHECK-NEXT: cmpq $4, %rbp
; CHECK-NEXT: jb .LBB1_14
; CHECK-NEXT: # %bb.4: # %min.iters.checked
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movq %rbp, %r8
-; CHECK-NEXT: andq $-4, %r8
+; CHECK-NEXT: movq %rbp, %rdx
+; CHECK-NEXT: andq $-4, %rdx
; CHECK-NEXT: je .LBB1_14
; CHECK-NEXT: # %bb.5: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; CHECK-NEXT: imulq %rdi, %r11
-; CHECK-NEXT: leaq (%r12,%r11), %rbx
-; CHECK-NEXT: leaq (%rbx,%r9,8), %rbx
-; CHECK-NEXT: testq %r9, %r9
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: imulq %r14, %rax
+; CHECK-NEXT: leaq (%rsi,%rax), %r10
+; CHECK-NEXT: leaq (%r10,%r13,8), %r9
+; CHECK-NEXT: testq %r13, %r13
; CHECK-NEXT: movq $-1, %r10
-; CHECK-NEXT: cmovnsq %r9, %r10
-; CHECK-NEXT: cmpq %rdx, %rbx
+; CHECK-NEXT: cmovnsq %r13, %r10
+; CHECK-NEXT: cmpq %r15, %r9
; CHECK-NEXT: jae .LBB1_7
; CHECK-NEXT: # %bb.6: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; CHECK-NEXT: leaq (%r11,%r10,8), %rbx
-; CHECK-NEXT: cmpq %rdx, %rbx
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; CHECK-NEXT: leaq (%rax,%r10,8), %rax
+; CHECK-NEXT: cmpq %r15, %rax
; CHECK-NEXT: ja .LBB1_14
; CHECK-NEXT: .LBB1_7: # %vector.body.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: leaq -4(%r8), %rbx
-; CHECK-NEXT: movq %rbx, %r11
-; CHECK-NEXT: shrq $2, %r11
-; CHECK-NEXT: btl $2, %ebx
+; CHECK-NEXT: leaq -4(%rdx), %r9
+; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: shrq $2, %rax
+; CHECK-NEXT: btl $2, %r9d
; CHECK-NEXT: jb .LBB1_8
; CHECK-NEXT: # %bb.9: # %vector.body.prol.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: movdqu %xmm0, (%rsi,%r9,8)
-; CHECK-NEXT: movdqu %xmm0, 16(%rsi,%r9,8)
+; CHECK-NEXT: movdqu %xmm0, (%r12,%r13,8)
+; CHECK-NEXT: movdqu %xmm0, 16(%r12,%r13,8)
; CHECK-NEXT: movl $4, %r10d
-; CHECK-NEXT: testq %r11, %r11
+; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: jne .LBB1_11
; CHECK-NEXT: jmp .LBB1_13
; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: xorl %r10d, %r10d
-; CHECK-NEXT: testq %r11, %r11
+; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: je .LBB1_13
; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: movq %r10, %rbx
-; CHECK-NEXT: subq %r8, %rbx
-; CHECK-NEXT: addq %r9, %r10
-; CHECK-NEXT: leaq (%rax,%r10,8), %r10
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: subq %rdx, %rax
+; CHECK-NEXT: addq %r13, %r10
+; CHECK-NEXT: leaq (%r11,%r10,8), %r10
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_12: # %vector.body
; CHECK-NEXT: # Parent Loop BB1_2 Depth=1
@@ -233,28 +233,28 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: movdqu %xmm0, (%r10)
; CHECK-NEXT: movdqu %xmm0, 16(%r10)
; CHECK-NEXT: addq $64, %r10
-; CHECK-NEXT: addq $8, %rbx
+; CHECK-NEXT: addq $8, %rax
; CHECK-NEXT: jne .LBB1_12
; CHECK-NEXT: .LBB1_13: # %middle.block
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq %r8, %r9
-; CHECK-NEXT: cmpq %r8, %rbp
-; CHECK-NEXT: movq %r9, %rbx
+; CHECK-NEXT: addq %rdx, %r13
+; CHECK-NEXT: cmpq %rdx, %rbp
+; CHECK-NEXT: movq %r13, %rdx
; CHECK-NEXT: je .LBB1_15
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_14: # %for.body2
; CHECK-NEXT: # Parent Loop BB1_2 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movq (%rdx), %rbp
-; CHECK-NEXT: movq %rbp, (%rsi,%r9,8)
-; CHECK-NEXT: leaq 1(%r9), %rbx
-; CHECK-NEXT: cmpq $-1, %r9
-; CHECK-NEXT: movq %rbx, %r9
+; CHECK-NEXT: movq (%r15), %rax
+; CHECK-NEXT: movq %rax, (%r12,%r13,8)
+; CHECK-NEXT: leaq 1(%r13), %rdx
+; CHECK-NEXT: cmpq $-1, %r13
+; CHECK-NEXT: movq %rdx, %r13
; CHECK-NEXT: jl .LBB1_14
; CHECK-NEXT: jmp .LBB1_15
; CHECK-NEXT: .LBB1_17: # %for.cond.for.end5_crit_edge
; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rax
-; CHECK-NEXT: movq %r14, (%rax)
+; CHECK-NEXT: movq %rdi, (%rax)
; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rax
; CHECK-NEXT: movl $0, (%rax)
; CHECK-NEXT: .LBB1_18: # %for.end5
diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index ec73b64b993b9..0efb371c274fe 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -114,20 +114,20 @@ define i96 @square_high(i96 %x) nounwind {
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: addq %r8, %rdx
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: adcq $0, %rax
-; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: adcq %r8, %rax
+; X64-NEXT: addq %rdx, %r8
+; X64-NEXT: adcq %rsi, %rax
; X64-NEXT: imulq %rcx, %rcx
; X64-NEXT: addq %rax, %rcx
-; X64-NEXT: shrdq $32, %rcx, %rsi
+; X64-NEXT: shrdq $32, %rcx, %r8
; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: movq %rcx, %rdx
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 914a2f1032398..587d7929e5848 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -181,18 +181,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %r8, %r14
-; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: movq %rcx, %r14
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rsi, %r12
; X64-NEXT: movq %rdi, %r13
; X64-NEXT: callq __divti3@PLT
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rdx, 8(%r14)
-; X64-NEXT: movq %rax, (%r14)
-; X64-NEXT: imulq %rax, %rbx
+; X64-NEXT: movq %rdx, 8(%rbx)
+; X64-NEXT: movq %rax, (%rbx)
+; X64-NEXT: imulq %rax, %r14
; X64-NEXT: mulq %r15
-; X64-NEXT: addq %rbx, %rdx
+; X64-NEXT: addq %r14, %rdx
; X64-NEXT: imulq %r15, %rcx
; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: subq %rax, %r13
@@ -343,40 +343,40 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r8d
+; X64-NEXT: movzbl %al, %edi
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r9d
+; X64-NEXT: movzbl %al, %esi
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r10d
+; X64-NEXT: movzbl %al, %r8d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r11d
+; X64-NEXT: movzbl %al, %r9d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r14d
+; X64-NEXT: movzbl %al, %r10d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r15d
+; X64-NEXT: movzbl %al, %r11d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r12d
+; X64-NEXT: movzbl %al, %ebx
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r13d
+; X64-NEXT: movzbl %al, %ebp
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edi
+; X64-NEXT: movzbl %al, %r14d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %esi
+; X64-NEXT: movzbl %al, %r15d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movzbl %al, %r12d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebp
+; X64-NEXT: movzbl %al, %r13d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %edx
@@ -385,26 +385,26 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movd %r8d, %xmm3
-; X64-NEXT: movd %r9d, %xmm4
-; X64-NEXT: movd %r10d, %xmm5
-; X64-NEXT: movd %r11d, %xmm6
+; X64-NEXT: movd %edi, %xmm3
+; X64-NEXT: movd %esi, %xmm4
+; X64-NEXT: movd %r8d, %xmm5
+; X64-NEXT: movd %r9d, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movd %r14d, %xmm2
+; X64-NEXT: movd %r10d, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; X64-NEXT: movd %r15d, %xmm4
+; X64-NEXT: movd %r11d, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X64-NEXT: movd %r12d, %xmm3
+; X64-NEXT: movd %ebx, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; X64-NEXT: movd %r13d, %xmm6
+; X64-NEXT: movd %ebp, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X64-NEXT: movd %edi, %xmm4
+; X64-NEXT: movd %r14d, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT: movd %esi, %xmm2
+; X64-NEXT: movd %r15d, %xmm2
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; X64-NEXT: movd %ebx, %xmm5
+; X64-NEXT: movd %r12d, %xmm5
; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; X64-NEXT: movd %ebp, %xmm6
+; X64-NEXT: movd %r13d, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X64-NEXT: movd %edx, %xmm2
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 67650ec1a6e3f..6f67a12f67938 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -181,18 +181,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %r8, %r14
-; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: movq %rcx, %r14
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rsi, %r12
; X64-NEXT: movq %rdi, %r13
; X64-NEXT: callq __udivti3@PLT
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rdx, 8(%r14)
-; X64-NEXT: movq %rax, (%r14)
-; X64-NEXT: imulq %rax, %rbx
+; X64-NEXT: movq %rdx, 8(%rbx)
+; X64-NEXT: movq %rax, (%rbx)
+; X64-NEXT: imulq %rax, %r14
; X64-NEXT: mulq %r15
-; X64-NEXT: addq %rbx, %rdx
+; X64-NEXT: addq %r14, %rdx
; X64-NEXT: imulq %r15, %rcx
; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: subq %rax, %r13
@@ -343,40 +343,40 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r8d
+; X64-NEXT: movzbl %al, %edi
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r9d
+; X64-NEXT: movzbl %al, %esi
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r10d
+; X64-NEXT: movzbl %al, %r8d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r11d
+; X64-NEXT: movzbl %al, %r9d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r14d
+; X64-NEXT: movzbl %al, %r10d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r15d
+; X64-NEXT: movzbl %al, %r11d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r12d
+; X64-NEXT: movzbl %al, %ebx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r13d
+; X64-NEXT: movzbl %al, %ebp
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edi
+; X64-NEXT: movzbl %al, %r14d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %esi
+; X64-NEXT: movzbl %al, %r15d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movzbl %al, %r12d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebp
+; X64-NEXT: movzbl %al, %r13d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl %al, %edx
@@ -385,26 +385,26 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movd %r8d, %xmm3
-; X64-NEXT: movd %r9d, %xmm4
-; X64-NEXT: movd %r10d, %xmm5
-; X64-NEXT: movd %r11d, %xmm6
+; X64-NEXT: movd %edi, %xmm3
+; X64-NEXT: movd %esi, %xmm4
+; X64-NEXT: movd %r8d, %xmm5
+; X64-NEXT: movd %r9d, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movd %r14d, %xmm2
+; X64-NEXT: movd %r10d, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; X64-NEXT: movd %r15d, %xmm4
+; X64-NEXT: movd %r11d, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X64-NEXT: movd %r12d, %xmm3
+; X64-NEXT: movd %ebx, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; X64-NEXT: movd %r13d, %xmm6
+; X64-NEXT: movd %ebp, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X64-NEXT: movd %edi, %xmm4
+; X64-NEXT: movd %r14d, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT: movd %esi, %xmm2
+; X64-NEXT: movd %r15d, %xmm2
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; X64-NEXT: movd %ebx, %xmm5
+; X64-NEXT: movd %r12d, %xmm5
; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; X64-NEXT: movd %ebp, %xmm6
+; X64-NEXT: movd %r13d, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X64-NEXT: movd %edx, %xmm2
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index be82b97eabb71..4549598ca3ed9 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -479,21 +479,21 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: movq %rdx, %r8
; WIN64-NEXT: movq %rcx, %r9
-; WIN64-NEXT: addq %rdx, %rcx
-; WIN64-NEXT: adcq $0, %rcx
+; WIN64-NEXT: addq %rdx, %r9
+; WIN64-NEXT: adcq $0, %r9
; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
-; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: shrq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
-; WIN64-NEXT: subq %rax, %rcx
-; WIN64-NEXT: subq %rcx, %r9
+; WIN64-NEXT: subq %rax, %r9
+; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
-; WIN64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
-; WIN64-NEXT: imulq %r9, %rcx
-; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
+; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: addq %rcx, %rdx
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
@@ -529,21 +529,21 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: movq %rdx, %r8
; WIN64-NEXT: movq %rcx, %r9
-; WIN64-NEXT: addq %rdx, %rcx
-; WIN64-NEXT: adcq $0, %rcx
+; WIN64-NEXT: addq %rdx, %r9
+; WIN64-NEXT: adcq $0, %r9
; WIN64-NEXT: movabsq $-3689348814741910323, %r10 # imm = 0xCCCCCCCCCCCCCCCD
-; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: shrq $2, %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
-; WIN64-NEXT: subq %rax, %rcx
-; WIN64-NEXT: subq %rcx, %r9
+; WIN64-NEXT: subq %rax, %r9
+; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
-; WIN64-NEXT: movabsq $-3689348814741910324, %rcx # imm = 0xCCCCCCCCCCCCCCCC
-; WIN64-NEXT: imulq %r9, %rcx
-; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: movabsq $-3689348814741910324, %r9 # imm = 0xCCCCCCCCCCCCCCCC
+; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: addq %rcx, %rdx
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
@@ -567,37 +567,37 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
-; X86-64-NEXT: movabsq $-1229782938247303442, %r8 # imm = 0xEEEEEEEEEEEEEEEE
-; X86-64-NEXT: imulq %rdi, %r8
-; X86-64-NEXT: movabsq $-1229782938247303441, %rcx # imm = 0xEEEEEEEEEEEEEEEF
+; X86-64-NEXT: movabsq $-1229782938247303442, %rcx # imm = 0xEEEEEEEEEEEEEEEE
+; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-1229782938247303441, %r8 # imm = 0xEEEEEEEEEEEEEEEF
; X86-64-NEXT: movq %rdi, %rax
-; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: addq %r8, %rdx
-; X86-64-NEXT: imulq %rsi, %rcx
+; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
+; X86-64-NEXT: imulq %rsi, %r8
+; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_i128_15:
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: movq %rdx, %r8
; WIN64-NEXT: movq %rcx, %r9
-; WIN64-NEXT: addq %rdx, %rcx
-; WIN64-NEXT: adcq $0, %rcx
+; WIN64-NEXT: addq %rdx, %r9
+; WIN64-NEXT: adcq $0, %r9
; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: shrq $3, %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: leaq (%rax,%rax,2), %rax
-; WIN64-NEXT: subq %rax, %rcx
-; WIN64-NEXT: subq %rcx, %r9
+; WIN64-NEXT: subq %rax, %r9
+; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
-; WIN64-NEXT: movabsq $-1229782938247303442, %rcx # imm = 0xEEEEEEEEEEEEEEEE
-; WIN64-NEXT: imulq %r9, %rcx
+; WIN64-NEXT: movabsq $-1229782938247303442, %r9 # imm = 0xEEEEEEEEEEEEEEEE
+; WIN64-NEXT: imulq %rcx, %r9
; WIN64-NEXT: movabsq $-1229782938247303441, %r10 # imm = 0xEEEEEEEEEEEEEEEF
-; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: addq %rcx, %rdx
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
@@ -635,23 +635,23 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: movq %rdx, %r8
; WIN64-NEXT: movq %rcx, %r9
-; WIN64-NEXT: addq %rdx, %rcx
-; WIN64-NEXT: adcq $0, %rcx
+; WIN64-NEXT: addq %rdx, %r9
+; WIN64-NEXT: adcq $0, %r9
; WIN64-NEXT: movabsq $-1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F1
-; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: andq $-16, %rax
; WIN64-NEXT: shrq $4, %rdx
; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
-; WIN64-NEXT: subq %rcx, %r9
+; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
-; WIN64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
-; WIN64-NEXT: imulq %r9, %rcx
-; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: movabsq $-1085102592571150096, %r9 # imm = 0xF0F0F0F0F0F0F0F0
+; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: addq %rcx, %rdx
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
@@ -677,14 +677,14 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
; X86-64-NEXT: adcq %rdx, %rax
; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: sbbq $0, %rsi
-; X86-64-NEXT: movabsq $-72340172838076674, %r8 # imm = 0xFEFEFEFEFEFEFEFE
-; X86-64-NEXT: imulq %rdi, %r8
-; X86-64-NEXT: movabsq $-72340172838076673, %rcx # imm = 0xFEFEFEFEFEFEFEFF
+; X86-64-NEXT: movabsq $-72340172838076674, %rcx # imm = 0xFEFEFEFEFEFEFEFE
+; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-72340172838076673, %r8 # imm = 0xFEFEFEFEFEFEFEFF
; X86-64-NEXT: movq %rdi, %rax
-; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: addq %r8, %rdx
-; X86-64-NEXT: imulq %rsi, %rcx
+; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
+; X86-64-NEXT: imulq %rsi, %r8
+; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_i128_255:
@@ -747,23 +747,23 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: movq %rdx, %r8
; WIN64-NEXT: movq %rcx, %r9
-; WIN64-NEXT: addq %rdx, %rcx
-; WIN64-NEXT: adcq $0, %rcx
+; WIN64-NEXT: addq %rdx, %r9
+; WIN64-NEXT: adcq $0, %r9
; WIN64-NEXT: movabsq $-71777214294589695, %r10 # imm = 0xFF00FF00FF00FF01
-; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: andq $-256, %rax
; WIN64-NEXT: shrq $8, %rdx
; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
-; WIN64-NEXT: subq %rcx, %r9
+; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
-; WIN64-NEXT: movabsq $-71777214294589696, %rcx # imm = 0xFF00FF00FF00FF00
-; WIN64-NEXT: imulq %r9, %rcx
-; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: movabsq $-71777214294589696, %r9 # imm = 0xFF00FF00FF00FF00
+; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: addq %rcx, %rdx
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
@@ -789,14 +789,14 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
; X86-64-NEXT: adcq %rdx, %rax
; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: sbbq $0, %rsi
-; X86-64-NEXT: movabsq $-281479271743490, %r8 # imm = 0xFFFEFFFEFFFEFFFE
-; X86-64-NEXT: imulq %rdi, %r8
-; X86-64-NEXT: movabsq $-281479271743489, %rcx # imm = 0xFFFEFFFEFFFEFFFF
+; X86-64-NEXT: movabsq $-281479271743490, %rcx # imm = 0xFFFEFFFEFFFEFFFE
+; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-281479271743489, %r8 # imm = 0xFFFEFFFEFFFEFFFF
; X86-64-NEXT: movq %rdi, %rax
-; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: addq %r8, %rdx
-; X86-64-NEXT: imulq %rsi, %rcx
+; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
+; X86-64-NEXT: imulq %rsi, %r8
+; X86-64-NEXT: addq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_i128_65535:
@@ -859,23 +859,23 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: movq %rdx, %r8
; WIN64-NEXT: movq %rcx, %r9
-; WIN64-NEXT: addq %rdx, %rcx
-; WIN64-NEXT: adcq $0, %rcx
+; WIN64-NEXT: addq %rdx, %r9
+; WIN64-NEXT: adcq $0, %r9
; WIN64-NEXT: movabsq $-281470681808895, %r10 # imm = 0xFFFF0000FFFF0001
-; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
; WIN64-NEXT: shrq $16, %rdx
; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
-; WIN64-NEXT: subq %rcx, %r9
+; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
-; WIN64-NEXT: movabsq $-281470681808896, %rcx # imm = 0xFFFF0000FFFF0000
-; WIN64-NEXT: imulq %r9, %rcx
-; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: movabsq $-281470681808896, %r9 # imm = 0xFFFF0000FFFF0000
+; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: addq %rcx, %rdx
+; WIN64-NEXT: addq %r9, %rdx
; WIN64-NEXT: imulq %r10, %r8
; WIN64-NEXT: addq %r8, %rdx
; WIN64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index 43d2ad4299b88..90b9a38c001ad 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -1921,58 +1921,58 @@ define i32 @bextr64_32_a1_trunc_extrause(i64 %val, i64 %numskipbits, i32 %numlow
;
; X64-NOBMI-LABEL: bextr64_32_a1_trunc_extrause:
; X64-NOBMI: # %bb.0:
-; X64-NOBMI-NEXT: pushq %rbp
+; X64-NOBMI-NEXT: pushq %r14
; X64-NOBMI-NEXT: pushq %rbx
; X64-NOBMI-NEXT: pushq %rax
-; X64-NOBMI-NEXT: movl %edx, %ebp
+; X64-NOBMI-NEXT: movl %edx, %ebx
; X64-NOBMI-NEXT: movq %rsi, %rcx
-; X64-NOBMI-NEXT: movq %rdi, %rbx
+; X64-NOBMI-NEXT: movq %rdi, %r14
; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT: shrq %cl, %rbx
-; X64-NOBMI-NEXT: movl %ebx, %edi
+; X64-NOBMI-NEXT: shrq %cl, %r14
+; X64-NOBMI-NEXT: movl %r14d, %edi
; X64-NOBMI-NEXT: callq use32@PLT
; X64-NOBMI-NEXT: movl $1, %eax
-; X64-NOBMI-NEXT: movl %ebp, %ecx
+; X64-NOBMI-NEXT: movl %ebx, %ecx
; X64-NOBMI-NEXT: shll %cl, %eax
; X64-NOBMI-NEXT: decl %eax
-; X64-NOBMI-NEXT: andl %ebx, %eax
+; X64-NOBMI-NEXT: andl %r14d, %eax
; X64-NOBMI-NEXT: addq $8, %rsp
; X64-NOBMI-NEXT: popq %rbx
-; X64-NOBMI-NEXT: popq %rbp
+; X64-NOBMI-NEXT: popq %r14
; X64-NOBMI-NEXT: retq
;
; X64-BMI1-LABEL: bextr64_32_a1_trunc_extrause:
; X64-BMI1: # %bb.0:
-; X64-BMI1-NEXT: pushq %rbp
+; X64-BMI1-NEXT: pushq %r14
; X64-BMI1-NEXT: pushq %rbx
; X64-BMI1-NEXT: pushq %rax
-; X64-BMI1-NEXT: movl %edx, %ebp
+; X64-BMI1-NEXT: movl %edx, %ebx
; X64-BMI1-NEXT: movq %rsi, %rcx
-; X64-BMI1-NEXT: movq %rdi, %rbx
+; X64-BMI1-NEXT: movq %rdi, %r14
; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-BMI1-NEXT: shrq %cl, %rbx
-; X64-BMI1-NEXT: movl %ebx, %edi
+; X64-BMI1-NEXT: shrq %cl, %r14
+; X64-BMI1-NEXT: movl %r14d, %edi
; X64-BMI1-NEXT: callq use32@PLT
-; X64-BMI1-NEXT: shll $8, %ebp
-; X64-BMI1-NEXT: bextrl %ebp, %ebx, %eax
+; X64-BMI1-NEXT: shll $8, %ebx
+; X64-BMI1-NEXT: bextrl %ebx, %r14d, %eax
; X64-BMI1-NEXT: addq $8, %rsp
; X64-BMI1-NEXT: popq %rbx
-; X64-BMI1-NEXT: popq %rbp
+; X64-BMI1-NEXT: popq %r14
; X64-BMI1-NEXT: retq
;
; X64-BMI2-LABEL: bextr64_32_a1_trunc_extrause:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: pushq %rbp
+; X64-BMI2-NEXT: pushq %r14
; X64-BMI2-NEXT: pushq %rbx
; X64-BMI2-NEXT: pushq %rax
-; X64-BMI2-NEXT: movl %edx, %ebp
-; X64-BMI2-NEXT: shrxq %rsi, %rdi, %rbx
-; X64-BMI2-NEXT: movl %ebx, %edi
+; X64-BMI2-NEXT: movl %edx, %ebx
+; X64-BMI2-NEXT: shrxq %rsi, %rdi, %r14
+; X64-BMI2-NEXT: movl %r14d, %edi
; X64-BMI2-NEXT: callq use32@PLT
-; X64-BMI2-NEXT: bzhil %ebp, %ebx, %eax
+; X64-BMI2-NEXT: bzhil %ebx, %r14d, %eax
; X64-BMI2-NEXT: addq $8, %rsp
; X64-BMI2-NEXT: popq %rbx
-; X64-BMI2-NEXT: popq %rbp
+; X64-BMI2-NEXT: popq %r14
; X64-BMI2-NEXT: retq
%shifted = lshr i64 %val, %numskipbits
%truncshifted = trunc i64 %shifted to i32
@@ -4782,20 +4782,20 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
; X64-NOBMI-NEXT: pushq %rbp
; X64-NOBMI-NEXT: pushq %r14
; X64-NOBMI-NEXT: pushq %rbx
-; X64-NOBMI-NEXT: movl %esi, %r14d
+; X64-NOBMI-NEXT: movl %esi, %ebx
; X64-NOBMI-NEXT: movl %edi, %ebp
-; X64-NOBMI-NEXT: movl %r14d, %ecx
+; X64-NOBMI-NEXT: movl %ebx, %ecx
; X64-NOBMI-NEXT: shrl %cl, %ebp
; X64-NOBMI-NEXT: negb %dl
-; X64-NOBMI-NEXT: movl $-1, %ebx
+; X64-NOBMI-NEXT: movl $-1, %r14d
; X64-NOBMI-NEXT: movl %edx, %ecx
-; X64-NOBMI-NEXT: shrl %cl, %ebx
-; X64-NOBMI-NEXT: movl %ebx, %edi
-; X64-NOBMI-NEXT: callq use32@PLT
-; X64-NOBMI-NEXT: andl %ebp, %ebx
+; X64-NOBMI-NEXT: shrl %cl, %r14d
; X64-NOBMI-NEXT: movl %r14d, %edi
; X64-NOBMI-NEXT: callq use32@PLT
-; X64-NOBMI-NEXT: movl %ebx, %eax
+; X64-NOBMI-NEXT: andl %ebp, %r14d
+; X64-NOBMI-NEXT: movl %ebx, %edi
+; X64-NOBMI-NEXT: callq use32@PLT
+; X64-NOBMI-NEXT: movl %r14d, %eax
; X64-NOBMI-NEXT: popq %rbx
; X64-NOBMI-NEXT: popq %r14
; X64-NOBMI-NEXT: popq %rbp
@@ -4806,20 +4806,20 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
; X64-BMI1-NEXT: pushq %rbp
; X64-BMI1-NEXT: pushq %r14
; X64-BMI1-NEXT: pushq %rbx
-; X64-BMI1-NEXT: movl %esi, %r14d
+; X64-BMI1-NEXT: movl %esi, %ebx
; X64-BMI1-NEXT: movl %edi, %ebp
-; X64-BMI1-NEXT: movl %r14d, %ecx
+; X64-BMI1-NEXT: movl %ebx, %ecx
; X64-BMI1-NEXT: shrl %cl, %ebp
; X64-BMI1-NEXT: negb %dl
-; X64-BMI1-NEXT: movl $-1, %ebx
+; X64-BMI1-NEXT: movl $-1, %r14d
; X64-BMI1-NEXT: movl %edx, %ecx
-; X64-BMI1-NEXT: shrl %cl, %ebx
-; X64-BMI1-NEXT: movl %ebx, %edi
-; X64-BMI1-NEXT: callq use32@PLT
-; X64-BMI1-NEXT: andl %ebp, %ebx
+; X64-BMI1-NEXT: shrl %cl, %r14d
; X64-BMI1-NEXT: movl %r14d, %edi
; X64-BMI1-NEXT: callq use32@PLT
-; X64-BMI1-NEXT: movl %ebx, %eax
+; X64-BMI1-NEXT: andl %ebp, %r14d
+; X64-BMI1-NEXT: movl %ebx, %edi
+; X64-BMI1-NEXT: callq use32@PLT
+; X64-BMI1-NEXT: movl %r14d, %eax
; X64-BMI1-NEXT: popq %rbx
; X64-BMI1-NEXT: popq %r14
; X64-BMI1-NEXT: popq %rbp
@@ -5000,17 +5000,17 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; X64-NOBMI-NEXT: pushq %rbx
; X64-NOBMI-NEXT: pushq %rax
; X64-NOBMI-NEXT: movq %rsi, %rcx
-; X64-NOBMI-NEXT: movq %rdi, %r14
+; X64-NOBMI-NEXT: movq %rdi, %rbx
; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT: shrq %cl, %r14
+; X64-NOBMI-NEXT: shrq %cl, %rbx
; X64-NOBMI-NEXT: negb %dl
-; X64-NOBMI-NEXT: movq $-1, %rbx
+; X64-NOBMI-NEXT: movq $-1, %r14
; X64-NOBMI-NEXT: movl %edx, %ecx
-; X64-NOBMI-NEXT: shrq %cl, %rbx
-; X64-NOBMI-NEXT: movq %rbx, %rdi
+; X64-NOBMI-NEXT: shrq %cl, %r14
+; X64-NOBMI-NEXT: movq %r14, %rdi
; X64-NOBMI-NEXT: callq use64@PLT
-; X64-NOBMI-NEXT: andq %r14, %rbx
-; X64-NOBMI-NEXT: movq %rbx, %rax
+; X64-NOBMI-NEXT: andq %rbx, %r14
+; X64-NOBMI-NEXT: movq %r14, %rax
; X64-NOBMI-NEXT: addq $8, %rsp
; X64-NOBMI-NEXT: popq %rbx
; X64-NOBMI-NEXT: popq %r14
@@ -5022,17 +5022,17 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; X64-BMI1-NEXT: pushq %rbx
; X64-BMI1-NEXT: pushq %rax
; X64-BMI1-NEXT: movq %rsi, %rcx
-; X64-BMI1-NEXT: movq %rdi, %r14
+; X64-BMI1-NEXT: movq %rdi, %rbx
; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-BMI1-NEXT: shrq %cl, %r14
+; X64-BMI1-NEXT: shrq %cl, %rbx
; X64-BMI1-NEXT: negb %dl
-; X64-BMI1-NEXT: movq $-1, %rbx
+; X64-BMI1-NEXT: movq $-1, %r14
; X64-BMI1-NEXT: movl %edx, %ecx
-; X64-BMI1-NEXT: shrq %cl, %rbx
-; X64-BMI1-NEXT: movq %rbx, %rdi
+; X64-BMI1-NEXT: shrq %cl, %r14
+; X64-BMI1-NEXT: movq %r14, %rdi
; X64-BMI1-NEXT: callq use64@PLT
-; X64-BMI1-NEXT: andq %r14, %rbx
-; X64-BMI1-NEXT: movq %rbx, %rax
+; X64-BMI1-NEXT: andq %rbx, %r14
+; X64-BMI1-NEXT: movq %r14, %rax
; X64-BMI1-NEXT: addq $8, %rsp
; X64-BMI1-NEXT: popq %rbx
; X64-BMI1-NEXT: popq %r14
@@ -5206,17 +5206,17 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
; X64-NOBMI-NEXT: pushq %rbx
; X64-NOBMI-NEXT: pushq %rax
; X64-NOBMI-NEXT: movl %esi, %ecx
-; X64-NOBMI-NEXT: movq %rdi, %r14
+; X64-NOBMI-NEXT: movq %rdi, %rbx
; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT: shrq %cl, %r14
+; X64-NOBMI-NEXT: shrq %cl, %rbx
; X64-NOBMI-NEXT: negb %dl
-; X64-NOBMI-NEXT: movq $-1, %rbx
+; X64-NOBMI-NEXT: movq $-1, %r14
; X64-NOBMI-NEXT: movl %edx, %ecx
-; X64-NOBMI-NEXT: shrq %cl, %rbx
-; X64-NOBMI-NEXT: movq %rbx, %rdi
+; X64-NOBMI-NEXT: shrq %cl, %r14
+; X64-NOBMI-NEXT: movq %r14, %rdi
; X64-NOBMI-NEXT: callq use64@PLT
-; X64-NOBMI-NEXT: andq %r14, %rbx
-; X64-NOBMI-NEXT: movq %rbx, %rax
+; X64-NOBMI-NEXT: andq %rbx, %r14
+; X64-NOBMI-NEXT: movq %r14, %rax
; X64-NOBMI-NEXT: addq $8, %rsp
; X64-NOBMI-NEXT: popq %rbx
; X64-NOBMI-NEXT: popq %r14
@@ -5228,17 +5228,17 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
; X64-BMI1-NEXT: pushq %rbx
; X64-BMI1-NEXT: pushq %rax
; X64-BMI1-NEXT: movl %esi, %ecx
-; X64-BMI1-NEXT: movq %rdi, %r14
+; X64-BMI1-NEXT: movq %rdi, %rbx
; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-BMI1-NEXT: shrq %cl, %r14
+; X64-BMI1-NEXT: shrq %cl, %rbx
; X64-BMI1-NEXT: negb %dl
-; X64-BMI1-NEXT: movq $-1, %rbx
+; X64-BMI1-NEXT: movq $-1, %r14
; X64-BMI1-NEXT: movl %edx, %ecx
-; X64-BMI1-NEXT: shrq %cl, %rbx
-; X64-BMI1-NEXT: movq %rbx, %rdi
+; X64-BMI1-NEXT: shrq %cl, %r14
+; X64-BMI1-NEXT: movq %r14, %rdi
; X64-BMI1-NEXT: callq use64@PLT
-; X64-BMI1-NEXT: andq %r14, %rbx
-; X64-BMI1-NEXT: movq %rbx, %rax
+; X64-BMI1-NEXT: andq %rbx, %r14
+; X64-BMI1-NEXT: movq %r14, %rax
; X64-BMI1-NEXT: addq $8, %rsp
; X64-BMI1-NEXT: popq %rbx
; X64-BMI1-NEXT: popq %r14
@@ -5838,17 +5838,17 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
; X64-NOBMI-NEXT: pushq %rbx
; X64-NOBMI-NEXT: pushq %rax
; X64-NOBMI-NEXT: movq %rsi, %rcx
-; X64-NOBMI-NEXT: movq %rdi, %r14
+; X64-NOBMI-NEXT: movq %rdi, %rbx
; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT: shrq %cl, %r14
+; X64-NOBMI-NEXT: shrq %cl, %rbx
; X64-NOBMI-NEXT: negb %dl
-; X64-NOBMI-NEXT: movq $-1, %rbx
+; X64-NOBMI-NEXT: movq $-1, %r14
; X64-NOBMI-NEXT: movl %edx, %ecx
-; X64-NOBMI-NEXT: shrq %cl, %rbx
-; X64-NOBMI-NEXT: movq %rbx, %rdi
+; X64-NOBMI-NEXT: shrq %cl, %r14
+; X64-NOBMI-NEXT: movq %r14, %rdi
; X64-NOBMI-NEXT: callq use64@PLT
-; X64-NOBMI-NEXT: andq %r14, %rbx
-; X64-NOBMI-NEXT: movq %rbx, %rax
+; X64-NOBMI-NEXT: andq %rbx, %r14
+; X64-NOBMI-NEXT: movq %r14, %rax
; X64-NOBMI-NEXT: addq $8, %rsp
; X64-NOBMI-NEXT: popq %rbx
; X64-NOBMI-NEXT: popq %r14
@@ -5860,17 +5860,17 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
; X64-BMI1-NEXT: pushq %rbx
; X64-BMI1-NEXT: pushq %rax
; X64-BMI1-NEXT: movq %rsi, %rcx
-; X64-BMI1-NEXT: movq %rdi, %r14
+; X64-BMI1-NEXT: movq %rdi, %rbx
; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-BMI1-NEXT: shrq %cl, %r14
+; X64-BMI1-NEXT: shrq %cl, %rbx
; X64-BMI1-NEXT: negb %dl
-; X64-BMI1-NEXT: movq $-1, %rbx
+; X64-BMI1-NEXT: movq $-1, %r14
; X64-BMI1-NEXT: movl %edx, %ecx
-; X64-BMI1-NEXT: shrq %cl, %rbx
-; X64-BMI1-NEXT: movq %rbx, %rdi
+; X64-BMI1-NEXT: shrq %cl, %r14
+; X64-BMI1-NEXT: movq %r14, %rdi
; X64-BMI1-NEXT: callq use64@PLT
-; X64-BMI1-NEXT: andq %r14, %rbx
-; X64-BMI1-NEXT: movq %rbx, %rax
+; X64-BMI1-NEXT: andq %rbx, %r14
+; X64-BMI1-NEXT: movq %r14, %rax
; X64-BMI1-NEXT: addq $8, %rsp
; X64-BMI1-NEXT: popq %rbx
; X64-BMI1-NEXT: popq %r14
@@ -6058,20 +6058,20 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
; X64-NOBMI-NEXT: pushq %r15
; X64-NOBMI-NEXT: pushq %r14
; X64-NOBMI-NEXT: pushq %rbx
-; X64-NOBMI-NEXT: movq %rsi, %r14
-; X64-NOBMI-NEXT: movq %rdi, %r15
-; X64-NOBMI-NEXT: movl %r14d, %ecx
-; X64-NOBMI-NEXT: shrq %cl, %r15
+; X64-NOBMI-NEXT: movq %rsi, %rbx
+; X64-NOBMI-NEXT: movq %rdi, %r14
+; X64-NOBMI-NEXT: movl %ebx, %ecx
+; X64-NOBMI-NEXT: shrq %cl, %r14
; X64-NOBMI-NEXT: negb %dl
-; X64-NOBMI-NEXT: movq $-1, %rbx
+; X64-NOBMI-NEXT: movq $-1, %r15
; X64-NOBMI-NEXT: movl %edx, %ecx
-; X64-NOBMI-NEXT: shrq %cl, %rbx
-; X64-NOBMI-NEXT: movq %rbx, %rdi
+; X64-NOBMI-NEXT: shrq %cl, %r15
+; X64-NOBMI-NEXT: movq %r15, %rdi
; X64-NOBMI-NEXT: callq use64@PLT
-; X64-NOBMI-NEXT: andq %r15, %rbx
-; X64-NOBMI-NEXT: movq %r14, %rdi
+; X64-NOBMI-NEXT: andq %r14, %r15
+; X64-NOBMI-NEXT: movq %rbx, %rdi
; X64-NOBMI-NEXT: callq use64@PLT
-; X64-NOBMI-NEXT: movq %rbx, %rax
+; X64-NOBMI-NEXT: movq %r15, %rax
; X64-NOBMI-NEXT: popq %rbx
; X64-NOBMI-NEXT: popq %r14
; X64-NOBMI-NEXT: popq %r15
@@ -6082,20 +6082,20 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
; X64-BMI1-NEXT: pushq %r15
; X64-BMI1-NEXT: pushq %r14
; X64-BMI1-NEXT: pushq %rbx
-; X64-BMI1-NEXT: movq %rsi, %r14
-; X64-BMI1-NEXT: movq %rdi, %r15
-; X64-BMI1-NEXT: movl %r14d, %ecx
-; X64-BMI1-NEXT: shrq %cl, %r15
+; X64-BMI1-NEXT: movq %rsi, %rbx
+; X64-BMI1-NEXT: movq %rdi, %r14
+; X64-BMI1-NEXT: movl %ebx, %ecx
+; X64-BMI1-NEXT: shrq %cl, %r14
; X64-BMI1-NEXT: negb %dl
-; X64-BMI1-NEXT: movq $-1, %rbx
+; X64-BMI1-NEXT: movq $-1, %r15
; X64-BMI1-NEXT: movl %edx, %ecx
-; X64-BMI1-NEXT: shrq %cl, %rbx
-; X64-BMI1-NEXT: movq %rbx, %rdi
+; X64-BMI1-NEXT: shrq %cl, %r15
+; X64-BMI1-NEXT: movq %r15, %rdi
; X64-BMI1-NEXT: callq use64@PLT
-; X64-BMI1-NEXT: andq %r15, %rbx
-; X64-BMI1-NEXT: movq %r14, %rdi
+; X64-BMI1-NEXT: andq %r14, %r15
+; X64-BMI1-NEXT: movq %rbx, %rdi
; X64-BMI1-NEXT: callq use64@PLT
-; X64-BMI1-NEXT: movq %rbx, %rax
+; X64-BMI1-NEXT: movq %r15, %rax
; X64-BMI1-NEXT: popq %rbx
; X64-BMI1-NEXT: popq %r14
; X64-BMI1-NEXT: popq %r15
diff --git a/llvm/test/CodeGen/X86/flt-rounds.ll b/llvm/test/CodeGen/X86/flt-rounds.ll
index 971a6a82b077a..6099987d69196 100644
--- a/llvm/test/CodeGen/X86/flt-rounds.ll
+++ b/llvm/test/CodeGen/X86/flt-rounds.ll
@@ -116,14 +116,14 @@ define i32 @multiple_flt_rounds() nounwind {
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; X64-NEXT: shrl $9, %ecx
; X64-NEXT: andb $6, %cl
-; X64-NEXT: movl $45, %r14d
+; X64-NEXT: movl $45, %ebx
; X64-NEXT: movl $45, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrl %cl, %eax
; X64-NEXT: andl $3, %eax
-; X64-NEXT: xorl %ebx, %ebx
+; X64-NEXT: xorl %r14d, %r14d
; X64-NEXT: cmpl $3, %eax
-; X64-NEXT: setne %bl
+; X64-NEXT: setne %r14b
; X64-NEXT: xorl %edi, %edi
; X64-NEXT: callq fesetround
; X64-NEXT: fnstcw {{[0-9]+}}(%rsp)
@@ -134,9 +134,9 @@ define i32 @multiple_flt_rounds() nounwind {
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrl %cl, %eax
; X64-NEXT: andl $3, %eax
-; X64-NEXT: leal 1(%rbx), %ebp
+; X64-NEXT: leal 1(%r14), %ebp
; X64-NEXT: cmpl $1, %eax
-; X64-NEXT: cmovel %ebx, %ebp
+; X64-NEXT: cmovel %r14d, %ebp
; X64-NEXT: movl $3072, %edi # imm = 0xC00
; X64-NEXT: callq fesetround
; X64-NEXT: fnstcw {{[0-9]+}}(%rsp)
@@ -156,10 +156,10 @@ define i32 @multiple_flt_rounds() nounwind {
; X64-NEXT: shrl $9, %ecx
; X64-NEXT: andb $6, %cl
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shrl %cl, %r14d
-; X64-NEXT: andl $3, %r14d
+; X64-NEXT: shrl %cl, %ebx
+; X64-NEXT: andl $3, %ebx
; X64-NEXT: xorl %ecx, %ecx
-; X64-NEXT: cmpl $2, %r14d
+; X64-NEXT: cmpl $2, %ebx
; X64-NEXT: setne %cl
; X64-NEXT: negl %ecx
; X64-NEXT: xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/fma-commute-loop.ll b/llvm/test/CodeGen/X86/fma-commute-loop.ll
index f998c45077586..833137fa6cd6d 100644
--- a/llvm/test/CodeGen/X86/fma-commute-loop.ll
+++ b/llvm/test/CodeGen/X86/fma-commute-loop.ll
@@ -12,17 +12,17 @@ define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; CHECK-NEXT: leaq (%rbx,%r14,8), %r14
-; CHECK-NEXT: leaq (%rbx,%r15,8), %r15
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; CHECK-NEXT: leaq (%r12,%r14,8), %r14
+; CHECK-NEXT: leaq (%r12,%r15,8), %r15
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; CHECK-NEXT: addq %r12, %r13
-; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r12
+; CHECK-NEXT: addq %rbx, %r13
+; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rbx
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
@@ -33,17 +33,17 @@ define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm6
; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm7
-; CHECK-NEXT: vmovupd (%rax,%r12,8), %zmm8
-; CHECK-NEXT: vbroadcastsd (%r15,%rbx,8), %zmm9
+; CHECK-NEXT: vmovupd (%rax,%rbx,8), %zmm8
+; CHECK-NEXT: vbroadcastsd (%r15,%r12,8), %zmm9
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2
-; CHECK-NEXT: vbroadcastsd (%r14,%rbx,8), %zmm9
+; CHECK-NEXT: vbroadcastsd (%r14,%r12,8), %zmm9
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5
-; CHECK-NEXT: incq %rbx
-; CHECK-NEXT: cmpq %rbx, %r10
+; CHECK-NEXT: incq %r12
+; CHECK-NEXT: cmpq %r12, %r10
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %bb51
; CHECK-NEXT: vmovapd %zmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/fmaddsub-combine.ll b/llvm/test/CodeGen/X86/fmaddsub-combine.ll
index 7e5916025eca0..427d62f5a28f5 100644
--- a/llvm/test/CodeGen/X86/fmaddsub-combine.ll
+++ b/llvm/test/CodeGen/X86/fmaddsub-combine.ll
@@ -569,14 +569,14 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D,
; NOFMA-NEXT: vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm4
; NOFMA-NEXT: vextractf128 $1, %ymm2, %xmm5
-; NOFMA-NEXT: vaddss %xmm5, %xmm4, %xmm8
+; NOFMA-NEXT: vaddss %xmm5, %xmm4, %xmm6
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; NOFMA-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; NOFMA-NEXT: vaddss %xmm6, %xmm7, %xmm9
-; NOFMA-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; NOFMA-NEXT: vmovshdup {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; NOFMA-NEXT: vsubss %xmm6, %xmm7, %xmm6
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3]
+; NOFMA-NEXT: vpermilpd {{.*#+}} xmm8 = xmm5[1,0]
+; NOFMA-NEXT: vaddss %xmm7, %xmm8, %xmm7
+; NOFMA-NEXT: vmovshdup {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; NOFMA-NEXT: vmovshdup {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; NOFMA-NEXT: vsubss %xmm9, %xmm8, %xmm8
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
@@ -585,8 +585,8 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D,
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm1, %xmm1
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[2,3]
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[2,3]
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2
@@ -694,31 +694,31 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float>
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; NOFMA-NEXT: vaddss %xmm4, %xmm0, %xmm8
+; NOFMA-NEXT: vaddss %xmm4, %xmm0, %xmm2
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0]
-; NOFMA-NEXT: vaddss %xmm6, %xmm3, %xmm9
+; NOFMA-NEXT: vaddss %xmm6, %xmm3, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm6
; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm7
-; NOFMA-NEXT: vaddss %xmm7, %xmm6, %xmm10
-; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm6[1,0]
-; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm7[1,0]
-; NOFMA-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm11 = xmm10[0,1],xmm2[0],xmm10[3]
-; NOFMA-NEXT: vaddss %xmm5, %xmm1, %xmm10
-; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm5[1,0]
-; NOFMA-NEXT: vaddss %xmm2, %xmm3, %xmm12
-; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm14
-; NOFMA-NEXT: vpermilpd {{.*#+}} xmm13 = xmm14[1,0]
-; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm15
-; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm15[1,0]
-; NOFMA-NEXT: vaddss %xmm3, %xmm13, %xmm13
-; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; NOFMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; NOFMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[2,3]
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; NOFMA-NEXT: vaddss %xmm7, %xmm6, %xmm8
+; NOFMA-NEXT: vpermilpd {{.*#+}} xmm9 = xmm6[1,0]
+; NOFMA-NEXT: vpermilpd {{.*#+}} xmm10 = xmm7[1,0]
+; NOFMA-NEXT: vaddss %xmm10, %xmm9, %xmm9
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0],xmm8[3]
+; NOFMA-NEXT: vaddss %xmm5, %xmm1, %xmm9
+; NOFMA-NEXT: vpermilpd {{.*#+}} xmm10 = xmm1[1,0]
+; NOFMA-NEXT: vpermilpd {{.*#+}} xmm11 = xmm5[1,0]
+; NOFMA-NEXT: vaddss %xmm11, %xmm10, %xmm10
+; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm11
+; NOFMA-NEXT: vpermilpd {{.*#+}} xmm12 = xmm11[1,0]
+; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm13
+; NOFMA-NEXT: vpermilpd {{.*#+}} xmm14 = xmm13[1,0]
+; NOFMA-NEXT: vaddss %xmm14, %xmm12, %xmm12
+; NOFMA-NEXT: vmovshdup {{.*#+}} xmm14 = xmm0[1,1,3,3]
+; NOFMA-NEXT: vmovshdup {{.*#+}} xmm15 = xmm4[1,1,3,3]
+; NOFMA-NEXT: vsubss %xmm15, %xmm14, %xmm14
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[2,3]
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm0, %xmm0
@@ -726,22 +726,22 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float>
; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm7[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[0]
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm5[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[2,3]
-; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm12[0],xmm3[3]
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[2,3]
+; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm10[0],xmm3[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm1, %xmm1
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
-; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm14[1,1,3,3]
-; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3]
+; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm11[1,1,3,3]
+; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm13[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm13[0,0]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm14[3,3,3,3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm5 = xmm15[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm12[0,0]
+; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[3,3,3,3]
+; NOFMA-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -841,11 +841,11 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double>
; NOFMA-NEXT: vaddsd %xmm4, %xmm0, %xmm2
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm6
-; NOFMA-NEXT: vaddsd %xmm6, %xmm3, %xmm9
+; NOFMA-NEXT: vaddsd %xmm6, %xmm3, %xmm7
; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm8
; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm1
; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm5
-; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm7
+; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm9
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
; NOFMA-NEXT: vsubsd %xmm4, %xmm0, %xmm0
@@ -853,11 +853,11 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double>
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm6[1,0]
; NOFMA-NEXT: vsubsd %xmm3, %xmm2, %xmm2
-; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm9[0],xmm2[0]
+; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm7[0],xmm2[0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm5[1,0]
; NOFMA-NEXT: vsubsd %xmm3, %xmm1, %xmm1
-; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm7[0],xmm1[0]
+; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
; NOFMA-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll
index 2160624d5de33..2e1af1e84e076 100644
--- a/llvm/test/CodeGen/X86/fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/fmaxnum.ll
@@ -322,29 +322,29 @@ define <16 x float> @test_intrinsic_fmax_v16f32(<16 x float> %x, <16 x float> %y
;
; SSE4-LABEL: test_intrinsic_fmax_v16f32:
; SSE4: # %bb.0:
-; SSE4-NEXT: movaps %xmm3, %xmm8
-; SSE4-NEXT: movaps %xmm2, %xmm9
-; SSE4-NEXT: movaps %xmm1, %xmm2
-; SSE4-NEXT: movaps %xmm4, %xmm10
-; SSE4-NEXT: maxps %xmm0, %xmm10
+; SSE4-NEXT: movaps %xmm3, %xmm11
+; SSE4-NEXT: movaps %xmm2, %xmm10
+; SSE4-NEXT: movaps %xmm1, %xmm9
+; SSE4-NEXT: movaps %xmm4, %xmm8
+; SSE4-NEXT: maxps %xmm0, %xmm8
; SSE4-NEXT: cmpunordps %xmm0, %xmm0
-; SSE4-NEXT: blendvps %xmm0, %xmm4, %xmm10
+; SSE4-NEXT: blendvps %xmm0, %xmm4, %xmm8
; SSE4-NEXT: movaps %xmm5, %xmm1
-; SSE4-NEXT: maxps %xmm2, %xmm1
-; SSE4-NEXT: cmpunordps %xmm2, %xmm2
-; SSE4-NEXT: movaps %xmm2, %xmm0
-; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE4-NEXT: movaps %xmm6, %xmm2
-; SSE4-NEXT: maxps %xmm9, %xmm2
+; SSE4-NEXT: maxps %xmm9, %xmm1
; SSE4-NEXT: cmpunordps %xmm9, %xmm9
; SSE4-NEXT: movaps %xmm9, %xmm0
+; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm1
+; SSE4-NEXT: movaps %xmm6, %xmm2
+; SSE4-NEXT: maxps %xmm10, %xmm2
+; SSE4-NEXT: cmpunordps %xmm10, %xmm10
+; SSE4-NEXT: movaps %xmm10, %xmm0
; SSE4-NEXT: blendvps %xmm0, %xmm6, %xmm2
; SSE4-NEXT: movaps %xmm7, %xmm3
-; SSE4-NEXT: maxps %xmm8, %xmm3
-; SSE4-NEXT: cmpunordps %xmm8, %xmm8
-; SSE4-NEXT: movaps %xmm8, %xmm0
+; SSE4-NEXT: maxps %xmm11, %xmm3
+; SSE4-NEXT: cmpunordps %xmm11, %xmm11
+; SSE4-NEXT: movaps %xmm11, %xmm0
; SSE4-NEXT: blendvps %xmm0, %xmm7, %xmm3
-; SSE4-NEXT: movaps %xmm10, %xmm0
+; SSE4-NEXT: movaps %xmm8, %xmm0
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_intrinsic_fmax_v16f32:
@@ -471,29 +471,29 @@ define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y)
;
; SSE4-LABEL: test_intrinsic_fmax_v8f64:
; SSE4: # %bb.0:
-; SSE4-NEXT: movapd %xmm3, %xmm8
-; SSE4-NEXT: movapd %xmm2, %xmm9
-; SSE4-NEXT: movapd %xmm1, %xmm2
-; SSE4-NEXT: movapd %xmm4, %xmm10
-; SSE4-NEXT: maxpd %xmm0, %xmm10
+; SSE4-NEXT: movapd %xmm3, %xmm11
+; SSE4-NEXT: movapd %xmm2, %xmm10
+; SSE4-NEXT: movapd %xmm1, %xmm9
+; SSE4-NEXT: movapd %xmm4, %xmm8
+; SSE4-NEXT: maxpd %xmm0, %xmm8
; SSE4-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm10
+; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm8
; SSE4-NEXT: movapd %xmm5, %xmm1
-; SSE4-NEXT: maxpd %xmm2, %xmm1
-; SSE4-NEXT: cmpunordpd %xmm2, %xmm2
-; SSE4-NEXT: movapd %xmm2, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1
-; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: maxpd %xmm9, %xmm2
+; SSE4-NEXT: maxpd %xmm9, %xmm1
; SSE4-NEXT: cmpunordpd %xmm9, %xmm9
; SSE4-NEXT: movapd %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: maxpd %xmm10, %xmm2
+; SSE4-NEXT: cmpunordpd %xmm10, %xmm10
+; SSE4-NEXT: movapd %xmm10, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm2
; SSE4-NEXT: movapd %xmm7, %xmm3
-; SSE4-NEXT: maxpd %xmm8, %xmm3
-; SSE4-NEXT: cmpunordpd %xmm8, %xmm8
-; SSE4-NEXT: movapd %xmm8, %xmm0
+; SSE4-NEXT: maxpd %xmm11, %xmm3
+; SSE4-NEXT: cmpunordpd %xmm11, %xmm11
+; SSE4-NEXT: movapd %xmm11, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm3
-; SSE4-NEXT: movapd %xmm10, %xmm0
+; SSE4-NEXT: movapd %xmm8, %xmm0
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_intrinsic_fmax_v8f64:
diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll
index 6025a4f3adfdc..1290a7b819106 100644
--- a/llvm/test/CodeGen/X86/fminnum.ll
+++ b/llvm/test/CodeGen/X86/fminnum.ll
@@ -322,29 +322,29 @@ define <16 x float> @test_intrinsic_fmin_v16f32(<16 x float> %x, <16 x float> %y
;
; SSE4-LABEL: test_intrinsic_fmin_v16f32:
; SSE4: # %bb.0:
-; SSE4-NEXT: movaps %xmm3, %xmm8
-; SSE4-NEXT: movaps %xmm2, %xmm9
-; SSE4-NEXT: movaps %xmm1, %xmm2
-; SSE4-NEXT: movaps %xmm4, %xmm10
-; SSE4-NEXT: minps %xmm0, %xmm10
+; SSE4-NEXT: movaps %xmm3, %xmm11
+; SSE4-NEXT: movaps %xmm2, %xmm10
+; SSE4-NEXT: movaps %xmm1, %xmm9
+; SSE4-NEXT: movaps %xmm4, %xmm8
+; SSE4-NEXT: minps %xmm0, %xmm8
; SSE4-NEXT: cmpunordps %xmm0, %xmm0
-; SSE4-NEXT: blendvps %xmm0, %xmm4, %xmm10
+; SSE4-NEXT: blendvps %xmm0, %xmm4, %xmm8
; SSE4-NEXT: movaps %xmm5, %xmm1
-; SSE4-NEXT: minps %xmm2, %xmm1
-; SSE4-NEXT: cmpunordps %xmm2, %xmm2
-; SSE4-NEXT: movaps %xmm2, %xmm0
-; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE4-NEXT: movaps %xmm6, %xmm2
-; SSE4-NEXT: minps %xmm9, %xmm2
+; SSE4-NEXT: minps %xmm9, %xmm1
; SSE4-NEXT: cmpunordps %xmm9, %xmm9
; SSE4-NEXT: movaps %xmm9, %xmm0
+; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm1
+; SSE4-NEXT: movaps %xmm6, %xmm2
+; SSE4-NEXT: minps %xmm10, %xmm2
+; SSE4-NEXT: cmpunordps %xmm10, %xmm10
+; SSE4-NEXT: movaps %xmm10, %xmm0
; SSE4-NEXT: blendvps %xmm0, %xmm6, %xmm2
; SSE4-NEXT: movaps %xmm7, %xmm3
-; SSE4-NEXT: minps %xmm8, %xmm3
-; SSE4-NEXT: cmpunordps %xmm8, %xmm8
-; SSE4-NEXT: movaps %xmm8, %xmm0
+; SSE4-NEXT: minps %xmm11, %xmm3
+; SSE4-NEXT: cmpunordps %xmm11, %xmm11
+; SSE4-NEXT: movaps %xmm11, %xmm0
; SSE4-NEXT: blendvps %xmm0, %xmm7, %xmm3
-; SSE4-NEXT: movaps %xmm10, %xmm0
+; SSE4-NEXT: movaps %xmm8, %xmm0
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_intrinsic_fmin_v16f32:
@@ -471,29 +471,29 @@ define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y)
;
; SSE4-LABEL: test_intrinsic_fmin_v8f64:
; SSE4: # %bb.0:
-; SSE4-NEXT: movapd %xmm3, %xmm8
-; SSE4-NEXT: movapd %xmm2, %xmm9
-; SSE4-NEXT: movapd %xmm1, %xmm2
-; SSE4-NEXT: movapd %xmm4, %xmm10
-; SSE4-NEXT: minpd %xmm0, %xmm10
+; SSE4-NEXT: movapd %xmm3, %xmm11
+; SSE4-NEXT: movapd %xmm2, %xmm10
+; SSE4-NEXT: movapd %xmm1, %xmm9
+; SSE4-NEXT: movapd %xmm4, %xmm8
+; SSE4-NEXT: minpd %xmm0, %xmm8
; SSE4-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm10
+; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm8
; SSE4-NEXT: movapd %xmm5, %xmm1
-; SSE4-NEXT: minpd %xmm2, %xmm1
-; SSE4-NEXT: cmpunordpd %xmm2, %xmm2
-; SSE4-NEXT: movapd %xmm2, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1
-; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: minpd %xmm9, %xmm2
+; SSE4-NEXT: minpd %xmm9, %xmm1
; SSE4-NEXT: cmpunordpd %xmm9, %xmm9
; SSE4-NEXT: movapd %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: minpd %xmm10, %xmm2
+; SSE4-NEXT: cmpunordpd %xmm10, %xmm10
+; SSE4-NEXT: movapd %xmm10, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm2
; SSE4-NEXT: movapd %xmm7, %xmm3
-; SSE4-NEXT: minpd %xmm8, %xmm3
-; SSE4-NEXT: cmpunordpd %xmm8, %xmm8
-; SSE4-NEXT: movapd %xmm8, %xmm0
+; SSE4-NEXT: minpd %xmm11, %xmm3
+; SSE4-NEXT: cmpunordpd %xmm11, %xmm11
+; SSE4-NEXT: movapd %xmm11, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm3
-; SSE4-NEXT: movapd %xmm10, %xmm0
+; SSE4-NEXT: movapd %xmm8, %xmm0
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_intrinsic_fmin_v8f64:
diff --git a/llvm/test/CodeGen/X86/fp-stack-2results.ll b/llvm/test/CodeGen/X86/fp-stack-2results.ll
index 0552fc8defd74..ef0efa5a5a36a 100644
--- a/llvm/test/CodeGen/X86/fp-stack-2results.ll
+++ b/llvm/test/CodeGen/X86/fp-stack-2results.ll
@@ -68,11 +68,11 @@ define void @call1(ptr%P1, ptr%P2) {
; x86_64-NEXT: .cfi_def_cfa_offset 32
; x86_64-NEXT: .cfi_offset %rbx, -24
; x86_64-NEXT: .cfi_offset %r14, -16
-; x86_64-NEXT: movq %rsi, %r14
-; x86_64-NEXT: movq %rdi, %rbx
+; x86_64-NEXT: movq %rsi, %rbx
+; x86_64-NEXT: movq %rdi, %r14
; x86_64-NEXT: callq test@PLT
-; x86_64-NEXT: fstpt (%rbx)
; x86_64-NEXT: fstpt (%r14)
+; x86_64-NEXT: fstpt (%rbx)
; x86_64-NEXT: addq $8, %rsp
; x86_64-NEXT: .cfi_def_cfa_offset 24
; x86_64-NEXT: popq %rbx
@@ -121,12 +121,12 @@ define void @call2(ptr%P1, ptr%P2) {
; x86_64-NEXT: .cfi_def_cfa_offset 32
; x86_64-NEXT: .cfi_offset %rbx, -24
; x86_64-NEXT: .cfi_offset %r14, -16
-; x86_64-NEXT: movq %rsi, %r14
-; x86_64-NEXT: movq %rdi, %rbx
+; x86_64-NEXT: movq %rsi, %rbx
+; x86_64-NEXT: movq %rdi, %r14
; x86_64-NEXT: callq test@PLT
; x86_64-NEXT: fxch %st(1)
-; x86_64-NEXT: fstpt (%rbx)
; x86_64-NEXT: fstpt (%r14)
+; x86_64-NEXT: fstpt (%rbx)
; x86_64-NEXT: addq $8, %rsp
; x86_64-NEXT: .cfi_def_cfa_offset 24
; x86_64-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index af5b0a7c83342..caea147ec81cd 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -1175,12 +1175,12 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: callq __eqtf2@PLT
; CHECK-NEXT: testl %eax, %eax
-; CHECK-NEXT: cmovneq %r14, %rbx
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: cmovneq %rbx, %r14
+; CHECK-NEXT: movq %r14, %rax
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
@@ -1221,12 +1221,12 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: callq __eqtf2@PLT
; CHECK-NEXT: testl %eax, %eax
-; CHECK-NEXT: cmovneq %r14, %rbx
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: cmovneq %rbx, %r14
+; CHECK-NEXT: movq %r14, %rax
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
@@ -1270,8 +1270,8 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
; CHECK-NEXT: subq $32, %rsp
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: callq __eqtf2@PLT
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: sete %bpl
@@ -1281,8 +1281,8 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: setne %al
; CHECK-NEXT: orb %bpl, %al
-; CHECK-NEXT: cmoveq %r14, %rbx
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: cmoveq %rbx, %r14
+; CHECK-NEXT: movq %r14, %rax
; CHECK-NEXT: addq $32, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
@@ -1353,8 +1353,8 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
; CHECK-NEXT: subq $32, %rsp
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: callq __eqtf2@PLT
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: setne %bpl
@@ -1364,8 +1364,8 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: sete %al
; CHECK-NEXT: testb %bpl, %al
-; CHECK-NEXT: cmoveq %r14, %rbx
-; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: cmoveq %rbx, %r14
+; CHECK-NEXT: movq %r14, %rax
; CHECK-NEXT: addq $32, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll
index d4bc27539e644..c1df1fbca8881 100644
--- a/llvm/test/CodeGen/X86/fp128-select.ll
+++ b/llvm/test/CodeGen/X86/fp128-select.ll
@@ -92,16 +92,16 @@ define fp128 @test_select_cc(fp128, fp128) {
; NOSSE-NEXT: .cfi_offset %r14, -32
; NOSSE-NEXT: .cfi_offset %r15, -24
; NOSSE-NEXT: .cfi_offset %rbp, -16
-; NOSSE-NEXT: movq %rcx, %r12
-; NOSSE-NEXT: movq %rdx, %rbx
-; NOSSE-NEXT: movq %rsi, %r14
-; NOSSE-NEXT: movq %rdi, %r15
+; NOSSE-NEXT: movq %rcx, %r15
+; NOSSE-NEXT: movq %rdx, %r12
+; NOSSE-NEXT: movq %rsi, %rbx
+; NOSSE-NEXT: movq %rdi, %r14
; NOSSE-NEXT: callq __netf2@PLT
; NOSSE-NEXT: movl %eax, %ebp
-; NOSSE-NEXT: movq %r15, %rdi
-; NOSSE-NEXT: movq %r14, %rsi
-; NOSSE-NEXT: movq %rbx, %rdx
-; NOSSE-NEXT: movq %r12, %rcx
+; NOSSE-NEXT: movq %r14, %rdi
+; NOSSE-NEXT: movq %rbx, %rsi
+; NOSSE-NEXT: movq %r12, %rdx
+; NOSSE-NEXT: movq %r15, %rcx
; NOSSE-NEXT: callq __eqtf2@PLT
; NOSSE-NEXT: movl %eax, %ecx
; NOSSE-NEXT: xorl %eax, %eax
@@ -111,8 +111,8 @@ define fp128 @test_select_cc(fp128, fp128) {
; NOSSE-NEXT: testl %ebp, %ebp
; NOSSE-NEXT: je .LBB1_2
; NOSSE-NEXT: # %bb.1:
-; NOSSE-NEXT: movq %r15, %rax
-; NOSSE-NEXT: movq %r14, %rdx
+; NOSSE-NEXT: movq %r14, %rax
+; NOSSE-NEXT: movq %rbx, %rdx
; NOSSE-NEXT: .LBB1_2: # %BB2
; NOSSE-NEXT: popq %rbx
; NOSSE-NEXT: .cfi_def_cfa_offset 40
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index c6883afe07edd..c351c1b82cf19 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -161,62 +161,62 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) {
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; CHECK-NEXT: movdqa %xmm4, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; CHECK-NEXT: pxor %xmm9, %xmm9
-; CHECK-NEXT: pcmpeqd %xmm9, %xmm5
-; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; CHECK-NEXT: movdqa %xmm3, %xmm7
-; CHECK-NEXT: pcmpgtd %xmm1, %xmm7
-; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; CHECK-NEXT: pand %xmm5, %xmm6
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; CHECK-NEXT: por %xmm6, %xmm1
+; CHECK-NEXT: pxor %xmm6, %xmm6
+; CHECK-NEXT: pcmpeqd %xmm6, %xmm5
+; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295]
+; CHECK-NEXT: movdqa %xmm7, %xmm8
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm8
+; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; CHECK-NEXT: pand %xmm5, %xmm9
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; CHECK-NEXT: por %xmm9, %xmm1
; CHECK-NEXT: pand %xmm1, %xmm4
-; CHECK-NEXT: pandn %xmm8, %xmm1
+; CHECK-NEXT: pandn %xmm3, %xmm1
; CHECK-NEXT: por %xmm4, %xmm1
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: pxor %xmm0, %xmm4
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm9, %xmm5
-; CHECK-NEXT: pcmpgtd %xmm4, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; CHECK-NEXT: pcmpeqd %xmm6, %xmm5
+; CHECK-NEXT: pcmpgtd %xmm4, %xmm7
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; CHECK-NEXT: pand %xmm5, %xmm4
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-NEXT: por %xmm4, %xmm3
-; CHECK-NEXT: pand %xmm3, %xmm2
-; CHECK-NEXT: pandn %xmm8, %xmm3
-; CHECK-NEXT: por %xmm2, %xmm3
-; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
-; CHECK-NEXT: movdqa %xmm3, %xmm4
-; CHECK-NEXT: pxor %xmm0, %xmm4
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; CHECK-NEXT: por %xmm4, %xmm5
+; CHECK-NEXT: pand %xmm5, %xmm2
+; CHECK-NEXT: pandn %xmm3, %xmm5
+; CHECK-NEXT: por %xmm2, %xmm5
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
+; CHECK-NEXT: movdqa %xmm5, %xmm3
+; CHECK-NEXT: pxor %xmm0, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; CHECK-NEXT: pcmpeqd %xmm6, %xmm6
-; CHECK-NEXT: pcmpeqd %xmm6, %xmm5
+; CHECK-NEXT: pcmpeqd %xmm6, %xmm4
; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320]
-; CHECK-NEXT: pcmpgtd %xmm7, %xmm4
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
-; CHECK-NEXT: pand %xmm5, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-NEXT: por %xmm2, %xmm4
-; CHECK-NEXT: pand %xmm4, %xmm3
-; CHECK-NEXT: pandn %xmm8, %xmm4
-; CHECK-NEXT: por %xmm3, %xmm4
+; CHECK-NEXT: pcmpgtd %xmm7, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; CHECK-NEXT: pand %xmm4, %xmm8
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-NEXT: por %xmm8, %xmm3
+; CHECK-NEXT: pand %xmm3, %xmm5
+; CHECK-NEXT: pandn %xmm2, %xmm3
+; CHECK-NEXT: por %xmm5, %xmm3
; CHECK-NEXT: pxor %xmm1, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm6, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-NEXT: pcmpeqd %xmm6, %xmm4
; CHECK-NEXT: pcmpgtd %xmm7, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; CHECK-NEXT: pand %xmm2, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; CHECK-NEXT: pand %xmm4, %xmm5
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: por %xmm3, %xmm0
+; CHECK-NEXT: por %xmm5, %xmm0
; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: pandn %xmm8, %xmm0
+; CHECK-NEXT: pandn %xmm2, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; CHECK-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i64>
@@ -328,56 +328,56 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) {
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; CHECK-NEXT: movdqa %xmm4, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; CHECK-NEXT: pxor %xmm9, %xmm9
-; CHECK-NEXT: pcmpeqd %xmm9, %xmm5
-; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
-; CHECK-NEXT: movdqa %xmm3, %xmm7
-; CHECK-NEXT: pcmpgtd %xmm1, %xmm7
-; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; CHECK-NEXT: pand %xmm5, %xmm6
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; CHECK-NEXT: por %xmm6, %xmm1
+; CHECK-NEXT: pxor %xmm6, %xmm6
+; CHECK-NEXT: pcmpeqd %xmm6, %xmm5
+; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647]
+; CHECK-NEXT: movdqa %xmm7, %xmm8
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm8
+; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; CHECK-NEXT: pand %xmm5, %xmm9
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; CHECK-NEXT: por %xmm9, %xmm1
; CHECK-NEXT: pand %xmm1, %xmm4
-; CHECK-NEXT: pandn %xmm8, %xmm1
+; CHECK-NEXT: pandn %xmm3, %xmm1
; CHECK-NEXT: por %xmm4, %xmm1
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: pxor %xmm0, %xmm4
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm9, %xmm5
-; CHECK-NEXT: pcmpgtd %xmm4, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; CHECK-NEXT: pcmpeqd %xmm6, %xmm5
+; CHECK-NEXT: pcmpgtd %xmm4, %xmm7
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; CHECK-NEXT: pand %xmm5, %xmm4
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-NEXT: por %xmm4, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; CHECK-NEXT: por %xmm4, %xmm5
+; CHECK-NEXT: pand %xmm5, %xmm2
+; CHECK-NEXT: pandn %xmm3, %xmm5
+; CHECK-NEXT: por %xmm2, %xmm5
+; CHECK-NEXT: movdqa %xmm5, %xmm2
+; CHECK-NEXT: pxor %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm3
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm3
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT: pand %xmm3, %xmm2
-; CHECK-NEXT: pandn %xmm8, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; CHECK-NEXT: por %xmm2, %xmm3
-; CHECK-NEXT: movdqa %xmm3, %xmm2
+; CHECK-NEXT: pand %xmm5, %xmm3
+; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pxor %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: pcmpgtd %xmm0, %xmm4
; CHECK-NEXT: pcmpeqd %xmm0, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT: pand %xmm4, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-NEXT: por %xmm2, %xmm4
-; CHECK-NEXT: pand %xmm3, %xmm4
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: pxor %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm3
-; CHECK-NEXT: pcmpgtd %xmm0, %xmm3
-; CHECK-NEXT: pcmpeqd %xmm0, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-NEXT: pand %xmm3, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; CHECK-NEXT: por %xmm2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; CHECK-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i64>
@@ -424,10 +424,10 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647]
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; CHECK-NEXT: movdqa %xmm3, %xmm1
-; CHECK-NEXT: movdqa %xmm3, %xmm2
+; CHECK-NEXT: movdqa %xmm3, %xmm8
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; CHECK-NEXT: pxor %xmm4, %xmm4
@@ -439,9 +439,9 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
; CHECK-NEXT: pand %xmm3, %xmm7
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
; CHECK-NEXT: por %xmm7, %xmm1
-; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: pandn %xmm8, %xmm1
-; CHECK-NEXT: por %xmm2, %xmm1
+; CHECK-NEXT: pand %xmm1, %xmm8
+; CHECK-NEXT: pandn %xmm2, %xmm1
+; CHECK-NEXT: por %xmm8, %xmm1
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; CHECK-NEXT: movdqa %xmm7, %xmm3
; CHECK-NEXT: pxor %xmm0, %xmm3
@@ -454,9 +454,9 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
; CHECK-NEXT: por %xmm3, %xmm4
; CHECK-NEXT: movdqa %xmm7, %xmm3
; CHECK-NEXT: pand %xmm4, %xmm3
-; CHECK-NEXT: pandn %xmm8, %xmm4
+; CHECK-NEXT: pandn %xmm2, %xmm4
; CHECK-NEXT: por %xmm3, %xmm4
-; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
; CHECK-NEXT: movdqa %xmm4, %xmm3
; CHECK-NEXT: pxor %xmm0, %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
@@ -464,23 +464,23 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
; CHECK-NEXT: pcmpeqd %xmm6, %xmm5
; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320]
; CHECK-NEXT: pcmpgtd %xmm7, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; CHECK-NEXT: pand %xmm5, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; CHECK-NEXT: pand %xmm5, %xmm8
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-NEXT: por %xmm2, %xmm3
+; CHECK-NEXT: por %xmm8, %xmm3
; CHECK-NEXT: pand %xmm3, %xmm4
-; CHECK-NEXT: pandn %xmm8, %xmm3
+; CHECK-NEXT: pandn %xmm2, %xmm3
; CHECK-NEXT: por %xmm4, %xmm3
; CHECK-NEXT: pxor %xmm1, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm6, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-NEXT: pcmpeqd %xmm6, %xmm4
; CHECK-NEXT: pcmpgtd %xmm7, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; CHECK-NEXT: pand %xmm2, %xmm4
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; CHECK-NEXT: pand %xmm4, %xmm5
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: por %xmm4, %xmm0
+; CHECK-NEXT: por %xmm5, %xmm0
; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: pandn %xmm8, %xmm0
+; CHECK-NEXT: pandn %xmm2, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; CHECK-NEXT: addq $72, %rsp
@@ -633,10 +633,10 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; CHECK-NEXT: movdqa %xmm3, %xmm1
-; CHECK-NEXT: movdqa %xmm3, %xmm2
+; CHECK-NEXT: movdqa %xmm3, %xmm8
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; CHECK-NEXT: pxor %xmm4, %xmm4
@@ -648,9 +648,9 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEXT: pand %xmm3, %xmm7
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
; CHECK-NEXT: por %xmm7, %xmm1
-; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: pandn %xmm8, %xmm1
-; CHECK-NEXT: por %xmm2, %xmm1
+; CHECK-NEXT: pand %xmm1, %xmm8
+; CHECK-NEXT: pandn %xmm2, %xmm1
+; CHECK-NEXT: por %xmm8, %xmm1
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; CHECK-NEXT: movdqa %xmm7, %xmm3
; CHECK-NEXT: pxor %xmm0, %xmm3
@@ -663,7 +663,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEXT: por %xmm3, %xmm4
; CHECK-NEXT: movdqa %xmm7, %xmm3
; CHECK-NEXT: pand %xmm4, %xmm3
-; CHECK-NEXT: pandn %xmm8, %xmm4
+; CHECK-NEXT: pandn %xmm2, %xmm4
; CHECK-NEXT: por %xmm3, %xmm4
; CHECK-NEXT: movdqa %xmm4, %xmm2
; CHECK-NEXT: pxor %xmm0, %xmm2
@@ -1131,8 +1131,8 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __fixdfti@PLT
-; CHECK-NEXT: movq %rax, %r14
-; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: movq %rdx, %r14
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __fixdfti@PLT
; CHECK-NEXT: xorl %ecx, %ecx
@@ -1142,20 +1142,20 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-NEXT: sbbq $0, %rdi
; CHECK-NEXT: cmovgeq %rcx, %rdx
; CHECK-NEXT: cmovgeq %rsi, %rax
-; CHECK-NEXT: cmpq %rsi, %r14
-; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: cmpq %rsi, %rbx
+; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: sbbq $0, %rdi
-; CHECK-NEXT: cmovlq %rbx, %rcx
-; CHECK-NEXT: cmovlq %r14, %rsi
-; CHECK-NEXT: movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000
-; CHECK-NEXT: cmpq %rsi, %r8
-; CHECK-NEXT: movq $-1, %rbx
-; CHECK-NEXT: movq $-1, %rdi
-; CHECK-NEXT: sbbq %rcx, %rdi
-; CHECK-NEXT: cmovgeq %r8, %rsi
-; CHECK-NEXT: cmpq %rax, %r8
-; CHECK-NEXT: sbbq %rdx, %rbx
-; CHECK-NEXT: cmovgeq %r8, %rax
+; CHECK-NEXT: cmovlq %r14, %rcx
+; CHECK-NEXT: cmovlq %rbx, %rsi
+; CHECK-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: movq $-1, %r8
+; CHECK-NEXT: movq $-1, %r9
+; CHECK-NEXT: sbbq %rcx, %r9
+; CHECK-NEXT: cmovgeq %rdi, %rsi
+; CHECK-NEXT: cmpq %rax, %rdi
+; CHECK-NEXT: sbbq %rdx, %r8
+; CHECK-NEXT: cmovgeq %rdi, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movq %rsi, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1287,8 +1287,8 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __fixsfti@PLT
-; CHECK-NEXT: movq %rax, %r14
-; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: movq %rdx, %r14
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __fixsfti@PLT
; CHECK-NEXT: xorl %ecx, %ecx
@@ -1298,20 +1298,20 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-NEXT: sbbq $0, %rdi
; CHECK-NEXT: cmovgeq %rcx, %rdx
; CHECK-NEXT: cmovgeq %rsi, %rax
-; CHECK-NEXT: cmpq %rsi, %r14
-; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: cmpq %rsi, %rbx
+; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: sbbq $0, %rdi
-; CHECK-NEXT: cmovlq %rbx, %rcx
-; CHECK-NEXT: cmovlq %r14, %rsi
-; CHECK-NEXT: movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000
-; CHECK-NEXT: cmpq %rsi, %r8
-; CHECK-NEXT: movq $-1, %rbx
-; CHECK-NEXT: movq $-1, %rdi
-; CHECK-NEXT: sbbq %rcx, %rdi
-; CHECK-NEXT: cmovgeq %r8, %rsi
-; CHECK-NEXT: cmpq %rax, %r8
-; CHECK-NEXT: sbbq %rdx, %rbx
-; CHECK-NEXT: cmovgeq %r8, %rax
+; CHECK-NEXT: cmovlq %r14, %rcx
+; CHECK-NEXT: cmovlq %rbx, %rsi
+; CHECK-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: movq $-1, %r8
+; CHECK-NEXT: movq $-1, %r9
+; CHECK-NEXT: sbbq %rcx, %r9
+; CHECK-NEXT: cmovgeq %rdi, %rsi
+; CHECK-NEXT: cmpq %rax, %rdi
+; CHECK-NEXT: sbbq %rdx, %r8
+; CHECK-NEXT: cmovgeq %rdi, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movq %rsi, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1443,8 +1443,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __fixhfti@PLT
-; CHECK-NEXT: movq %rax, %r14
-; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: movq %rdx, %r14
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __fixhfti@PLT
; CHECK-NEXT: xorl %ecx, %ecx
@@ -1454,20 +1454,20 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-NEXT: sbbq $0, %rdi
; CHECK-NEXT: cmovgeq %rcx, %rdx
; CHECK-NEXT: cmovgeq %rsi, %rax
-; CHECK-NEXT: cmpq %rsi, %r14
-; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: cmpq %rsi, %rbx
+; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: sbbq $0, %rdi
-; CHECK-NEXT: cmovlq %rbx, %rcx
-; CHECK-NEXT: cmovlq %r14, %rsi
-; CHECK-NEXT: movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000
-; CHECK-NEXT: cmpq %rsi, %r8
-; CHECK-NEXT: movq $-1, %rbx
-; CHECK-NEXT: movq $-1, %rdi
-; CHECK-NEXT: sbbq %rcx, %rdi
-; CHECK-NEXT: cmovgeq %r8, %rsi
-; CHECK-NEXT: cmpq %rax, %r8
-; CHECK-NEXT: sbbq %rdx, %rbx
-; CHECK-NEXT: cmovgeq %r8, %rax
+; CHECK-NEXT: cmovlq %r14, %rcx
+; CHECK-NEXT: cmovlq %rbx, %rsi
+; CHECK-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: movq $-1, %r8
+; CHECK-NEXT: movq $-1, %r9
+; CHECK-NEXT: sbbq %rcx, %r9
+; CHECK-NEXT: cmovgeq %rdi, %rsi
+; CHECK-NEXT: cmpq %rax, %rdi
+; CHECK-NEXT: sbbq %rdx, %r8
+; CHECK-NEXT: cmovgeq %rdi, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movq %rsi, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1747,26 +1747,26 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) {
; CHECK-NEXT: movdqa %xmm3, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; CHECK-NEXT: pxor %xmm8, %xmm8
-; CHECK-NEXT: pcmpeqd %xmm8, %xmm4
+; CHECK-NEXT: pxor %xmm5, %xmm5
+; CHECK-NEXT: pcmpeqd %xmm5, %xmm4
; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
; CHECK-NEXT: movdqa %xmm6, %xmm7
; CHECK-NEXT: pcmpgtd %xmm1, %xmm7
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; CHECK-NEXT: pand %xmm4, %xmm5
+; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; CHECK-NEXT: pand %xmm4, %xmm8
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; CHECK-NEXT: por %xmm5, %xmm1
+; CHECK-NEXT: por %xmm8, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
; CHECK-NEXT: pand %xmm1, %xmm3
; CHECK-NEXT: pandn %xmm4, %xmm1
; CHECK-NEXT: por %xmm3, %xmm1
; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: pxor %xmm0, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm8, %xmm5
+; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; CHECK-NEXT: pcmpeqd %xmm5, %xmm7
; CHECK-NEXT: pcmpgtd %xmm3, %xmm6
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; CHECK-NEXT: pand %xmm5, %xmm3
+; CHECK-NEXT: pand %xmm7, %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; CHECK-NEXT: por %xmm3, %xmm5
; CHECK-NEXT: pand %xmm5, %xmm2
@@ -1911,26 +1911,26 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) {
; CHECK-NEXT: movdqa %xmm3, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; CHECK-NEXT: pxor %xmm8, %xmm8
-; CHECK-NEXT: pcmpeqd %xmm8, %xmm4
+; CHECK-NEXT: pxor %xmm5, %xmm5
+; CHECK-NEXT: pcmpeqd %xmm5, %xmm4
; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
; CHECK-NEXT: movdqa %xmm6, %xmm7
; CHECK-NEXT: pcmpgtd %xmm1, %xmm7
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; CHECK-NEXT: pand %xmm4, %xmm5
+; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; CHECK-NEXT: pand %xmm4, %xmm8
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; CHECK-NEXT: por %xmm5, %xmm1
+; CHECK-NEXT: por %xmm8, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
; CHECK-NEXT: pand %xmm1, %xmm3
; CHECK-NEXT: pandn %xmm4, %xmm1
; CHECK-NEXT: por %xmm3, %xmm1
; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: pxor %xmm0, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm8, %xmm5
+; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; CHECK-NEXT: pcmpeqd %xmm5, %xmm7
; CHECK-NEXT: pcmpgtd %xmm3, %xmm6
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; CHECK-NEXT: pand %xmm5, %xmm3
+; CHECK-NEXT: pand %xmm7, %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; CHECK-NEXT: por %xmm3, %xmm5
; CHECK-NEXT: pand %xmm5, %xmm2
@@ -2709,20 +2709,20 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
; CHECK-NEXT: cmoveq %rsi, %rcx
; CHECK-NEXT: cmovsq %r14, %rdi
; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000
-; CHECK-NEXT: movq %rbx, %rsi
-; CHECK-NEXT: cmovnsq %rcx, %rsi
-; CHECK-NEXT: cmpq %rbx, %rcx
-; CHECK-NEXT: cmovbeq %rbx, %rcx
+; CHECK-NEXT: movabsq $-9223372036854775808, %rsi # imm = 0x8000000000000000
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: cmovnsq %rcx, %r8
+; CHECK-NEXT: cmpq %rsi, %rcx
+; CHECK-NEXT: cmovbeq %rsi, %rcx
; CHECK-NEXT: cmpq $-1, %rdi
-; CHECK-NEXT: cmovneq %rsi, %rcx
+; CHECK-NEXT: cmovneq %r8, %rcx
; CHECK-NEXT: testq %rdx, %rdx
-; CHECK-NEXT: movq %rbx, %rsi
-; CHECK-NEXT: cmovnsq %rax, %rsi
-; CHECK-NEXT: cmpq %rbx, %rax
-; CHECK-NEXT: cmovbeq %rbx, %rax
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: cmovnsq %rax, %rdi
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: cmovbeq %rsi, %rax
; CHECK-NEXT: cmpq $-1, %rdx
-; CHECK-NEXT: cmovneq %rsi, %rax
+; CHECK-NEXT: cmovneq %rdi, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2873,20 +2873,20 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
; CHECK-NEXT: cmoveq %rsi, %rcx
; CHECK-NEXT: cmovsq %r14, %rdi
; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000
-; CHECK-NEXT: movq %rbx, %rsi
-; CHECK-NEXT: cmovnsq %rcx, %rsi
-; CHECK-NEXT: cmpq %rbx, %rcx
-; CHECK-NEXT: cmovbeq %rbx, %rcx
+; CHECK-NEXT: movabsq $-9223372036854775808, %rsi # imm = 0x8000000000000000
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: cmovnsq %rcx, %r8
+; CHECK-NEXT: cmpq %rsi, %rcx
+; CHECK-NEXT: cmovbeq %rsi, %rcx
; CHECK-NEXT: cmpq $-1, %rdi
-; CHECK-NEXT: cmovneq %rsi, %rcx
+; CHECK-NEXT: cmovneq %r8, %rcx
; CHECK-NEXT: testq %rdx, %rdx
-; CHECK-NEXT: movq %rbx, %rsi
-; CHECK-NEXT: cmovnsq %rax, %rsi
-; CHECK-NEXT: cmpq %rbx, %rax
-; CHECK-NEXT: cmovbeq %rbx, %rax
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: cmovnsq %rax, %rdi
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: cmovbeq %rsi, %rax
; CHECK-NEXT: cmpq $-1, %rdx
-; CHECK-NEXT: cmovneq %rsi, %rax
+; CHECK-NEXT: cmovneq %rdi, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -3037,20 +3037,20 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
; CHECK-NEXT: cmoveq %rsi, %rcx
; CHECK-NEXT: cmovsq %r14, %rdi
; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000
-; CHECK-NEXT: movq %rbx, %rsi
-; CHECK-NEXT: cmovnsq %rcx, %rsi
-; CHECK-NEXT: cmpq %rbx, %rcx
-; CHECK-NEXT: cmovbeq %rbx, %rcx
+; CHECK-NEXT: movabsq $-9223372036854775808, %rsi # imm = 0x8000000000000000
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: cmovnsq %rcx, %r8
+; CHECK-NEXT: cmpq %rsi, %rcx
+; CHECK-NEXT: cmovbeq %rsi, %rcx
; CHECK-NEXT: cmpq $-1, %rdi
-; CHECK-NEXT: cmovneq %rsi, %rcx
+; CHECK-NEXT: cmovneq %r8, %rcx
; CHECK-NEXT: testq %rdx, %rdx
-; CHECK-NEXT: movq %rbx, %rsi
-; CHECK-NEXT: cmovnsq %rax, %rsi
-; CHECK-NEXT: cmpq %rbx, %rax
-; CHECK-NEXT: cmovbeq %rbx, %rax
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: cmovnsq %rax, %rdi
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: cmovbeq %rsi, %rax
; CHECK-NEXT: cmpq $-1, %rdx
-; CHECK-NEXT: cmovneq %rsi, %rax
+; CHECK-NEXT: cmovneq %rdi, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
index f563d93c97f2a..d156239efee66 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
@@ -234,44 +234,44 @@ define <4 x i128> @test_signed_v4i128_v4f32(<4 x float> %f) nounwind {
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movq %rdi, %r12
+; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: callq __fixsfti@PLT
; CHECK-NEXT: movq %rdx, %r15
-; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %rbp, %rax
+; CHECK-NEXT: cmovbq %r14, %rax
; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; CHECK-NEXT: cmovbq %rcx, %r15
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movabsq $9223372036854775807, %r14 # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: cmovaq %r14, %r15
+; CHECK-NEXT: movabsq $9223372036854775807, %rbp # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: cmovaq %rbp, %r15
; CHECK-NEXT: movq $-1, %rcx
; CHECK-NEXT: cmovaq %rcx, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpq %rbp, %rax
+; CHECK-NEXT: cmovpq %r14, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: cmovpq %rbp, %r15
+; CHECK-NEXT: cmovpq %r14, %r15
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: callq __fixsfti@PLT
-; CHECK-NEXT: movq %rax, %r13
-; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rax, %r12
+; CHECK-NEXT: movq %rdx, %r13
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %rbp, %r13
+; CHECK-NEXT: cmovbq %r14, %r12
; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: cmovbq %rax, %rbx
+; CHECK-NEXT: cmovbq %rax, %r13
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %r14, %rbx
+; CHECK-NEXT: cmovaq %rbp, %r13
; CHECK-NEXT: movq $-1, %rax
-; CHECK-NEXT: cmovaq %rax, %r13
+; CHECK-NEXT: cmovaq %rax, %r12
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpq %rbp, %r13
-; CHECK-NEXT: cmovpq %rbp, %rbx
+; CHECK-NEXT: cmovpq %r14, %r12
+; CHECK-NEXT: cmovpq %r14, %r13
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -309,16 +309,16 @@ define <4 x i128> @test_signed_v4i128_v4f32(<4 x float> %f) nounwind {
; CHECK-NEXT: cmovpq %rsi, %rax
; CHECK-NEXT: movl $0, %ecx
; CHECK-NEXT: cmovpq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, 8(%r12)
-; CHECK-NEXT: movq %rax, (%r12)
-; CHECK-NEXT: movq %r14, 56(%r12)
-; CHECK-NEXT: movq %rbp, 48(%r12)
-; CHECK-NEXT: movq %rbx, 40(%r12)
-; CHECK-NEXT: movq %r13, 32(%r12)
-; CHECK-NEXT: movq %r15, 24(%r12)
+; CHECK-NEXT: movq %rdx, 8(%rbx)
+; CHECK-NEXT: movq %rax, (%rbx)
+; CHECK-NEXT: movq %r14, 56(%rbx)
+; CHECK-NEXT: movq %rbp, 48(%rbx)
+; CHECK-NEXT: movq %r13, 40(%rbx)
+; CHECK-NEXT: movq %r12, 32(%rbx)
+; CHECK-NEXT: movq %r15, 24(%rbx)
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: movq %rax, 16(%r12)
-; CHECK-NEXT: movq %r12, %rax
+; CHECK-NEXT: movq %rax, 16(%rbx)
+; CHECK-NEXT: movq %rbx, %rax
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
@@ -477,26 +477,26 @@ define <2 x i128> @test_signed_v2i128_v2f64(<2 x double> %f) nounwind {
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: callq __fixdfti@PLT
-; CHECK-NEXT: movq %rax, %r15
-; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rax, %r14
+; CHECK-NEXT: movq %rdx, %r15
; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r12, %r15
+; CHECK-NEXT: cmovbq %r12, %r14
; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: cmovbq %rax, %rbx
+; CHECK-NEXT: cmovbq %rax, %r15
; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movabsq $9223372036854775807, %rbp # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: cmovaq %rbp, %rbx
+; CHECK-NEXT: cmovaq %rbp, %r15
; CHECK-NEXT: movq $-1, %r13
-; CHECK-NEXT: cmovaq %r13, %r15
+; CHECK-NEXT: cmovaq %r13, %r14
; CHECK-NEXT: ucomisd %xmm0, %xmm0
+; CHECK-NEXT: cmovpq %r12, %r14
; CHECK-NEXT: cmovpq %r12, %r15
-; CHECK-NEXT: cmovpq %r12, %rbx
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __fixdfti@PLT
; CHECK-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload
@@ -510,11 +510,11 @@ define <2 x i128> @test_signed_v2i128_v2f64(<2 x double> %f) nounwind {
; CHECK-NEXT: ucomisd %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r12, %rax
; CHECK-NEXT: cmovpq %r12, %rdx
-; CHECK-NEXT: movq %rdx, 8(%r14)
-; CHECK-NEXT: movq %rax, (%r14)
-; CHECK-NEXT: movq %rbx, 24(%r14)
-; CHECK-NEXT: movq %r15, 16(%r14)
-; CHECK-NEXT: movq %r14, %rax
+; CHECK-NEXT: movq %rdx, 8(%rbx)
+; CHECK-NEXT: movq %rax, (%rbx)
+; CHECK-NEXT: movq %r15, 24(%rbx)
+; CHECK-NEXT: movq %r14, 16(%rbx)
+; CHECK-NEXT: movq %rbx, %rax
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
@@ -675,103 +675,103 @@ define <8 x i8> @test_signed_v8i8_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: cvttss2si %xmm0, %ebp
+; CHECK-NEXT: cvttss2si %xmm0, %r12d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movl $128, %r14d
-; CHECK-NEXT: cmovbl %r14d, %ebp
+; CHECK-NEXT: movl $128, %ebx
+; CHECK-NEXT: cmovbl %ebx, %r12d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movl $127, %r12d
-; CHECK-NEXT: cmoval %r12d, %ebp
-; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: movl $127, %ebp
+; CHECK-NEXT: cmoval %ebp, %r12d
+; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %r15d, %ebp
-; CHECK-NEXT: shll $8, %ebp
+; CHECK-NEXT: cmovpl %r14d, %r12d
+; CHECK-NEXT: shll $8, %r12d
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r12d, %eax
+; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %r15d, %eax
-; CHECK-NEXT: movzbl %al, %ebx
-; CHECK-NEXT: orl %ebp, %ebx
+; CHECK-NEXT: cmovpl %r14d, %eax
+; CHECK-NEXT: movzbl %al, %r15d
+; CHECK-NEXT: orl %r12d, %r15d
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r12d, %eax
+; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %r15d, %eax
-; CHECK-NEXT: movzbl %al, %ebp
+; CHECK-NEXT: cmovpl %r14d, %eax
+; CHECK-NEXT: movzbl %al, %r12d
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r12d, %eax
+; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %r15d, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: shll $8, %eax
-; CHECK-NEXT: orl %ebp, %eax
+; CHECK-NEXT: orl %r12d, %eax
; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: pinsrw $1, %ebx, %xmm0
+; CHECK-NEXT: pinsrw $1, %r15d, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: cvttss2si %xmm0, %ebx
+; CHECK-NEXT: cvttss2si %xmm0, %r15d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %ebx
+; CHECK-NEXT: cmovbl %ebx, %r15d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r12d, %ebx
+; CHECK-NEXT: cmoval %ebp, %r15d
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %r15d, %ebx
-; CHECK-NEXT: shll $8, %ebx
+; CHECK-NEXT: cmovpl %r14d, %r15d
+; CHECK-NEXT: shll $8, %r15d
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r12d, %eax
+; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %r15d, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: orl %ebx, %eax
+; CHECK-NEXT: orl %r15d, %eax
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: pinsrw $2, %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: cvttss2si %xmm0, %ebx
+; CHECK-NEXT: cvttss2si %xmm0, %r15d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %ebx
+; CHECK-NEXT: cmovbl %ebx, %r15d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r12d, %ebx
+; CHECK-NEXT: cmoval %ebp, %r15d
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %r15d, %ebx
-; CHECK-NEXT: shll $8, %ebx
+; CHECK-NEXT: cmovpl %r14d, %r15d
+; CHECK-NEXT: shll $8, %r15d
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r12d, %eax
+; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %r15d, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: orl %ebx, %eax
+; CHECK-NEXT: orl %r15d, %eax
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: pinsrw $3, %eax, %xmm0
; CHECK-NEXT: addq $32, %rsp
@@ -797,14 +797,14 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movl $32768, %r14d # imm = 0x8000
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: movl $32768, %ebx # imm = 0x8000
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $32767, %ebp # imm = 0x7FFF
; CHECK-NEXT: cmoval %ebp, %eax
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -812,11 +812,11 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
@@ -826,11 +826,11 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -838,11 +838,11 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
@@ -854,11 +854,11 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -866,11 +866,11 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
@@ -879,11 +879,11 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
@@ -891,11 +891,11 @@ define <8 x i16> @test_signed_v8i16_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -924,14 +924,14 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movl $-2147483648, %r14d # imm = 0x80000000
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: movl $-2147483648, %ebx # imm = 0x80000000
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF
; CHECK-NEXT: cmoval %ebp, %eax
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -939,11 +939,11 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
@@ -952,11 +952,11 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
@@ -964,11 +964,11 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -980,11 +980,11 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -992,11 +992,11 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
@@ -1006,11 +1006,11 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -1018,11 +1018,11 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ebx, %eax
+; CHECK-NEXT: cmovpl %r14d, %eax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
@@ -1049,11 +1049,11 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movabsq $-9223372036854775808, %r14 # imm = 0x8000000000000000
-; CHECK-NEXT: cmovbq %r14, %rax
+; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000
+; CHECK-NEXT: cmovbq %rbx, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movabsq $9223372036854775807, %rbx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: cmovaq %rbx, %rax
+; CHECK-NEXT: movabsq $9223372036854775807, %r14 # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: cmovaq %r14, %rax
; CHECK-NEXT: xorl %r15d, %r15d
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r15, %rax
@@ -1064,9 +1064,9 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rax
+; CHECK-NEXT: cmovbq %rbx, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rax
+; CHECK-NEXT: cmovaq %r14, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r15, %rax
; CHECK-NEXT: movq %rax, %xmm0
@@ -1078,9 +1078,9 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rax
+; CHECK-NEXT: cmovbq %rbx, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rax
+; CHECK-NEXT: cmovaq %r14, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r15, %rax
; CHECK-NEXT: movq %rax, %xmm0
@@ -1090,9 +1090,9 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rax
+; CHECK-NEXT: cmovbq %rbx, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rax
+; CHECK-NEXT: cmovaq %r14, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r15, %rax
; CHECK-NEXT: movq %rax, %xmm0
@@ -1104,9 +1104,9 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rax
+; CHECK-NEXT: cmovbq %rbx, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rax
+; CHECK-NEXT: cmovaq %r14, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r15, %rax
; CHECK-NEXT: movq %rax, %xmm0
@@ -1116,9 +1116,9 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rax
+; CHECK-NEXT: cmovbq %rbx, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rax
+; CHECK-NEXT: cmovaq %r14, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r15, %rax
; CHECK-NEXT: movq %rax, %xmm0
@@ -1130,9 +1130,9 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rax
+; CHECK-NEXT: cmovbq %rbx, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rax
+; CHECK-NEXT: cmovaq %r14, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r15, %rax
; CHECK-NEXT: movq %rax, %xmm0
@@ -1142,9 +1142,9 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rax
+; CHECK-NEXT: cmovbq %rbx, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rax
+; CHECK-NEXT: cmovaq %r14, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r15, %rax
; CHECK-NEXT: movq %rax, %xmm3
@@ -1189,10 +1189,10 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; CHECK-NEXT: cmovaq %rcx, %rdx
-; CHECK-NEXT: movq %rcx, %rbp
+; CHECK-NEXT: movq %rcx, %r15
; CHECK-NEXT: movq $-1, %rcx
; CHECK-NEXT: cmovaq %rcx, %rax
-; CHECK-NEXT: movq $-1, %r15
+; CHECK-NEXT: movq $-1, %r13
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r12, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1209,9 +1209,9 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cmovbq %r12, %rax
; CHECK-NEXT: cmovbq %r14, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbp, %rdx
-; CHECK-NEXT: cmovaq %r15, %rax
-; CHECK-NEXT: movq $-1, %r15
+; CHECK-NEXT: cmovaq %r15, %rdx
+; CHECK-NEXT: cmovaq %r13, %rax
+; CHECK-NEXT: movq $-1, %r13
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r12, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1228,9 +1228,9 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cmovbq %r12, %rax
; CHECK-NEXT: cmovbq %r14, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbp, %rdx
-; CHECK-NEXT: cmovaq %r15, %rax
-; CHECK-NEXT: movq $-1, %r15
+; CHECK-NEXT: cmovaq %r15, %rdx
+; CHECK-NEXT: cmovaq %r13, %rax
+; CHECK-NEXT: movq $-1, %r13
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r12, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1247,10 +1247,8 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cmovbq %r12, %rax
; CHECK-NEXT: cmovbq %r14, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbp, %rdx
-; CHECK-NEXT: movq %rbp, %r13
-; CHECK-NEXT: cmovaq %r15, %rax
-; CHECK-NEXT: movq $-1, %r15
+; CHECK-NEXT: cmovaq %r15, %rdx
+; CHECK-NEXT: cmovaq %r13, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r12, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1267,9 +1265,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbq %r12, %rax
; CHECK-NEXT: cmovbq %r14, %rbp
+; CHECK-NEXT: movq %r14, %r13
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %r13, %rbp
-; CHECK-NEXT: cmovaq %r15, %rax
+; CHECK-NEXT: cmovaq %r15, %rbp
+; CHECK-NEXT: movq $-1, %rcx
+; CHECK-NEXT: cmovaq %rcx, %rax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %r12, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1285,10 +1285,10 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbq %r12, %r14
-; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: cmovbq %rax, %r15
+; CHECK-NEXT: cmovbq %r13, %r15
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %r13, %r15
+; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: cmovaq %rax, %r15
; CHECK-NEXT: movq $-1, %rax
; CHECK-NEXT: cmovaq %rax, %r14
; CHECK-NEXT: ucomiss %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
index 148d63f89d602..7c053d2dad508 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
@@ -485,22 +485,22 @@ define <2 x i128> @test_unsigned_v2i128_v2f64(<2 x double> %f) nounwind {
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $32, %rsp
; CHECK-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: callq __fixunsdfti@PLT
-; CHECK-NEXT: movq %rax, %r15
-; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rax, %r14
+; CHECK-NEXT: movq %rdx, %r15
; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: xorpd %xmm0, %xmm0
; CHECK-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: ucomisd %xmm0, %xmm1
-; CHECK-NEXT: cmovbq %r12, %rbx
; CHECK-NEXT: cmovbq %r12, %r15
+; CHECK-NEXT: cmovbq %r12, %r14
; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-NEXT: movq $-1, %r13
+; CHECK-NEXT: cmovaq %r13, %r14
; CHECK-NEXT: cmovaq %r13, %r15
-; CHECK-NEXT: cmovaq %r13, %rbx
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __fixunsdfti@PLT
; CHECK-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload
@@ -510,11 +510,11 @@ define <2 x i128> @test_unsigned_v2i128_v2f64(<2 x double> %f) nounwind {
; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovaq %r13, %rax
; CHECK-NEXT: cmovaq %r13, %rdx
-; CHECK-NEXT: movq %rdx, 8(%r14)
-; CHECK-NEXT: movq %rax, (%r14)
-; CHECK-NEXT: movq %rbx, 24(%r14)
-; CHECK-NEXT: movq %r15, 16(%r14)
-; CHECK-NEXT: movq %r14, %rax
+; CHECK-NEXT: movq %rdx, 8(%rbx)
+; CHECK-NEXT: movq %rax, (%rbx)
+; CHECK-NEXT: movq %r15, 24(%rbx)
+; CHECK-NEXT: movq %r14, 16(%rbx)
+; CHECK-NEXT: movq %rbx, %rax
; CHECK-NEXT: addq $32, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
@@ -657,87 +657,87 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: cvttss2si %xmm0, %ebp
-; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: cvttss2si %xmm0, %r15d
+; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: cmovbl %r14d, %ebp
+; CHECK-NEXT: cmovbl %ebx, %r15d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movl $255, %r15d
-; CHECK-NEXT: cmoval %r15d, %ebp
-; CHECK-NEXT: shll $8, %ebp
+; CHECK-NEXT: movl $255, %ebp
+; CHECK-NEXT: cmoval %ebp, %r15d
+; CHECK-NEXT: shll $8, %r15d
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r15d, %eax
-; CHECK-NEXT: movzbl %al, %ebx
-; CHECK-NEXT: orl %ebp, %ebx
+; CHECK-NEXT: cmoval %ebp, %eax
+; CHECK-NEXT: movzbl %al, %r14d
+; CHECK-NEXT: orl %r15d, %r14d
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r15d, %eax
-; CHECK-NEXT: movzbl %al, %ebp
+; CHECK-NEXT: cmoval %ebp, %eax
+; CHECK-NEXT: movzbl %al, %r15d
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r15d, %eax
+; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: shll $8, %eax
-; CHECK-NEXT: orl %ebp, %eax
+; CHECK-NEXT: orl %r15d, %eax
; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: pinsrw $1, %ebx, %xmm0
+; CHECK-NEXT: pinsrw $1, %r14d, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: cvttss2si %xmm0, %ebx
+; CHECK-NEXT: cvttss2si %xmm0, %r14d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %ebx
+; CHECK-NEXT: cmovbl %ebx, %r14d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r15d, %ebx
-; CHECK-NEXT: shll $8, %ebx
+; CHECK-NEXT: cmoval %ebp, %r14d
+; CHECK-NEXT: shll $8, %r14d
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r15d, %eax
+; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: orl %ebx, %eax
+; CHECK-NEXT: orl %r14d, %eax
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: pinsrw $2, %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: cvttss2si %xmm0, %ebx
+; CHECK-NEXT: cvttss2si %xmm0, %r14d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %ebx
+; CHECK-NEXT: cmovbl %ebx, %r14d
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r15d, %ebx
-; CHECK-NEXT: shll $8, %ebx
+; CHECK-NEXT: cmoval %ebp, %r14d
+; CHECK-NEXT: shll $8, %r14d
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %eax
+; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r15d, %eax
+; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: orl %ebx, %eax
+; CHECK-NEXT: orl %r14d, %eax
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: pinsrw $3, %eax, %xmm0
; CHECK-NEXT: addq $40, %rsp
@@ -983,13 +983,13 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: sarq $63, %rdx
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: cmovbq %r14, %rdx
+; CHECK-NEXT: cmovbq %rbx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movq $-1, %rbx
-; CHECK-NEXT: cmovaq %rbx, %rdx
+; CHECK-NEXT: movq $-1, %r14
+; CHECK-NEXT: cmovaq %r14, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
@@ -1004,9 +1004,9 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rdx
+; CHECK-NEXT: cmovbq %rbx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rdx
+; CHECK-NEXT: cmovaq %r14, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -1023,9 +1023,9 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rdx
+; CHECK-NEXT: cmovbq %rbx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rdx
+; CHECK-NEXT: cmovaq %r14, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -1040,9 +1040,9 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rdx
+; CHECK-NEXT: cmovbq %rbx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rdx
+; CHECK-NEXT: cmovaq %r14, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -1059,9 +1059,9 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rdx
+; CHECK-NEXT: cmovbq %rbx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rdx
+; CHECK-NEXT: cmovaq %r14, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -1076,9 +1076,9 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rdx
+; CHECK-NEXT: cmovbq %rbx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rdx
+; CHECK-NEXT: cmovaq %r14, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -1095,9 +1095,9 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rdx
+; CHECK-NEXT: cmovbq %rbx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rdx
+; CHECK-NEXT: cmovaq %r14, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -1112,9 +1112,9 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbq %r14, %rdx
+; CHECK-NEXT: cmovbq %rbx, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovaq %rbx, %rdx
+; CHECK-NEXT: cmovaq %r14, %rdx
; CHECK-NEXT: movq %rdx, %xmm3
; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; CHECK-NEXT: # xmm3 = xmm3[0],mem[0]
diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index 6cb135e9c769b..4c4e4b2dae577 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -53,17 +53,17 @@ define <4 x double> @foo(ptr %p, ptr %i, ptr %h) nounwind {
; WIN-SSE2: # %bb.0:
; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE2-NEXT: pand (%r8), %xmm0
-; WIN-SSE2-NEXT: movd %xmm0, %r8d
+; WIN-SSE2-NEXT: movd %xmm0, %eax
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; WIN-SSE2-NEXT: movd %xmm1, %r9d
+; WIN-SSE2-NEXT: movd %xmm1, %edx
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; WIN-SSE2-NEXT: movd %xmm1, %r10d
+; WIN-SSE2-NEXT: movd %xmm1, %r8d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; WIN-SSE2-NEXT: movd %xmm0, %edx
-; WIN-SSE2-NEXT: movslq %r8d, %rax
-; WIN-SSE2-NEXT: movslq %r9d, %r8
-; WIN-SSE2-NEXT: movslq %r10d, %r9
+; WIN-SSE2-NEXT: movd %xmm0, %r9d
+; WIN-SSE2-NEXT: cltq
; WIN-SSE2-NEXT: movslq %edx, %rdx
+; WIN-SSE2-NEXT: movslq %r8d, %r8
+; WIN-SSE2-NEXT: movslq %r9d, %r9
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; WIN-SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
@@ -185,18 +185,18 @@ define <4 x i64> @old(ptr %p, ptr %i, ptr %h, i64 %f) nounwind {
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; WIN-SSE2-NEXT: movd %xmm1, %ecx
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; WIN-SSE2-NEXT: movd %xmm1, %r8d
+; WIN-SSE2-NEXT: movd %xmm1, %edx
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; WIN-SSE2-NEXT: movd %xmm0, %edx
+; WIN-SSE2-NEXT: movd %xmm0, %r8d
; WIN-SSE2-NEXT: andl %r9d, %eax
; WIN-SSE2-NEXT: andl %r9d, %ecx
-; WIN-SSE2-NEXT: andl %r9d, %r8d
; WIN-SSE2-NEXT: andl %r9d, %edx
+; WIN-SSE2-NEXT: andl %r9d, %r8d
; WIN-SSE2-NEXT: movq %rax, %xmm0
; WIN-SSE2-NEXT: movq %rcx, %xmm1
; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIN-SSE2-NEXT: movq %rdx, %xmm2
-; WIN-SSE2-NEXT: movq %r8, %xmm1
+; WIN-SSE2-NEXT: movq %r8, %xmm2
+; WIN-SSE2-NEXT: movq %rdx, %xmm1
; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; WIN-SSE2-NEXT: retq
;
@@ -206,17 +206,17 @@ define <4 x i64> @old(ptr %p, ptr %i, ptr %h, i64 %f) nounwind {
; WIN-SSE4-NEXT: pand (%r8), %xmm0
; WIN-SSE4-NEXT: movd %xmm0, %eax
; WIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
-; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
-; WIN-SSE4-NEXT: pextrd $3, %xmm0, %edx
+; WIN-SSE4-NEXT: pextrd $2, %xmm0, %edx
+; WIN-SSE4-NEXT: pextrd $3, %xmm0, %r8d
; WIN-SSE4-NEXT: andl %r9d, %eax
; WIN-SSE4-NEXT: andl %r9d, %ecx
-; WIN-SSE4-NEXT: andl %r9d, %r8d
; WIN-SSE4-NEXT: andl %r9d, %edx
+; WIN-SSE4-NEXT: andl %r9d, %r8d
; WIN-SSE4-NEXT: movq %rcx, %xmm1
; WIN-SSE4-NEXT: movq %rax, %xmm0
; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIN-SSE4-NEXT: movq %rdx, %xmm2
-; WIN-SSE4-NEXT: movq %r8, %xmm1
+; WIN-SSE4-NEXT: movq %r8, %xmm2
+; WIN-SSE4-NEXT: movq %rdx, %xmm1
; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; WIN-SSE4-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/h-registers-1.ll b/llvm/test/CodeGen/X86/h-registers-1.ll
index 0ef9764b50273..07d85d260a37a 100644
--- a/llvm/test/CodeGen/X86/h-registers-1.ll
+++ b/llvm/test/CodeGen/X86/h-registers-1.ll
@@ -19,21 +19,20 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movzbl %bh, %esi
-; CHECK-NEXT: movzbl %ah, %eax
-; CHECK-NEXT: movq %rax, %r10
+; CHECK-NEXT: movzbl %ah, %edi
; CHECK-NEXT: movzbl %dh, %edx
; CHECK-NEXT: movzbl %ch, %ebp
; CHECK-NEXT: movq %r8, %rax
; CHECK-NEXT: movzbl %ah, %ecx
; CHECK-NEXT: movq %r9, %rax
-; CHECK-NEXT: movzbl %ah, %edi
+; CHECK-NEXT: movzbl %ah, %ebx
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-NEXT: addq %r10, %rsi
+; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; CHECK-NEXT: addq %rdi, %rsi
; CHECK-NEXT: addq %rbp, %rdx
; CHECK-NEXT: addq %rsi, %rdx
-; CHECK-NEXT: addq %rdi, %rcx
-; CHECK-NEXT: addq %rbx, %rax
+; CHECK-NEXT: addq %rbx, %rcx
+; CHECK-NEXT: addq %r8, %rax
; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: popq %rbx
@@ -53,21 +52,20 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
; GNUX32-NEXT: movq %rsi, %rax
; GNUX32-NEXT: movq %rdi, %rbx
; GNUX32-NEXT: movzbl %bh, %esi
-; GNUX32-NEXT: movzbl %ah, %eax
-; GNUX32-NEXT: movq %rax, %r10
+; GNUX32-NEXT: movzbl %ah, %edi
; GNUX32-NEXT: movzbl %dh, %edx
; GNUX32-NEXT: movzbl %ch, %ebp
; GNUX32-NEXT: movq %r8, %rax
; GNUX32-NEXT: movzbl %ah, %ecx
; GNUX32-NEXT: movq %r9, %rax
-; GNUX32-NEXT: movzbl %ah, %edi
+; GNUX32-NEXT: movzbl %ah, %ebx
; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; GNUX32-NEXT: addq %r10, %rsi
+; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %r8d
+; GNUX32-NEXT: addq %rdi, %rsi
; GNUX32-NEXT: addq %rbp, %rdx
; GNUX32-NEXT: addq %rsi, %rdx
-; GNUX32-NEXT: addq %rdi, %rcx
-; GNUX32-NEXT: addq %rbx, %rax
+; GNUX32-NEXT: addq %rbx, %rcx
+; GNUX32-NEXT: addq %r8, %rax
; GNUX32-NEXT: addq %rcx, %rax
; GNUX32-NEXT: addq %rdx, %rax
; GNUX32-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll
index c022cd4e072b8..66e86f0e4a1a3 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/CodeGen/X86/haddsub-2.ll
@@ -519,13 +519,13 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SSE3-NEXT: movd %xmm4, %r8d
-; SSE3-NEXT: addl %ecx, %r8d
+; SSE3-NEXT: movd %xmm4, %eax
+; SSE3-NEXT: addl %ecx, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE3-NEXT: movd %xmm0, %r9d
-; SSE3-NEXT: addl %edx, %r9d
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: addl %edx, %ecx
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT: movd %xmm0, %esi
@@ -535,36 +535,36 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %edx, %edi
-; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: movd %xmm2, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; SSE3-NEXT: movd %xmm0, %r10d
-; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %r8d, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: movd %xmm0, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
-; SSE3-NEXT: movd %xmm0, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: movd %xmm0, %r9d
+; SSE3-NEXT: addl %r8d, %r9d
+; SSE3-NEXT: movd %xmm3, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; SSE3-NEXT: movd %xmm0, %edx
-; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: movd %xmm0, %r10d
+; SSE3-NEXT: addl %r8d, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE3-NEXT: movd %xmm0, %r11d
+; SSE3-NEXT: movd %xmm0, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
-; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: addl %r11d, %eax
+; SSE3-NEXT: movd %xmm0, %r11d
+; SSE3-NEXT: addl %r8d, %r11d
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movd %r9d, %xmm2
-; SSE3-NEXT: movd %r8d, %xmm0
+; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movd %edx, %xmm2
+; SSE3-NEXT: movd %r11d, %xmm1
+; SSE3-NEXT: movd %r10d, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE3-NEXT: movd %ecx, %xmm3
-; SSE3-NEXT: movd %r10d, %xmm1
+; SSE3-NEXT: movd %r9d, %xmm3
+; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: retq
@@ -634,88 +634,88 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: pushq %rbx
-; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: pextrw $1, %xmm0, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSE3-NEXT: pextrw $3, %xmm0, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSE3-NEXT: pextrw $5, %xmm0, %r11d
-; SSE3-NEXT: addl %eax, %r11d
-; SSE3-NEXT: pextrw $6, %xmm0, %eax
-; SSE3-NEXT: pextrw $7, %xmm0, %r15d
-; SSE3-NEXT: addl %eax, %r15d
-; SSE3-NEXT: movd %xmm1, %eax
-; SSE3-NEXT: pextrw $1, %xmm1, %r13d
-; SSE3-NEXT: addl %eax, %r13d
-; SSE3-NEXT: pextrw $2, %xmm1, %eax
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: pextrw $1, %xmm0, %eax
+; SSE3-NEXT: addl %ecx, %eax
+; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm0, %edx
+; SSE3-NEXT: pextrw $3, %xmm0, %eax
+; SSE3-NEXT: addl %edx, %eax
+; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE3-NEXT: pextrw $4, %xmm0, %edx
+; SSE3-NEXT: pextrw $5, %xmm0, %esi
+; SSE3-NEXT: addl %edx, %esi
+; SSE3-NEXT: pextrw $6, %xmm0, %edx
+; SSE3-NEXT: pextrw $7, %xmm0, %r8d
+; SSE3-NEXT: addl %edx, %r8d
+; SSE3-NEXT: movd %xmm1, %edx
+; SSE3-NEXT: pextrw $1, %xmm1, %r10d
+; SSE3-NEXT: addl %edx, %r10d
+; SSE3-NEXT: pextrw $2, %xmm1, %edx
; SSE3-NEXT: pextrw $3, %xmm1, %ebx
-; SSE3-NEXT: addl %eax, %ebx
-; SSE3-NEXT: pextrw $4, %xmm1, %eax
-; SSE3-NEXT: pextrw $5, %xmm1, %r8d
-; SSE3-NEXT: addl %eax, %r8d
-; SSE3-NEXT: pextrw $6, %xmm1, %eax
-; SSE3-NEXT: pextrw $7, %xmm1, %esi
-; SSE3-NEXT: addl %eax, %esi
-; SSE3-NEXT: movd %xmm2, %eax
-; SSE3-NEXT: pextrw $1, %xmm2, %r10d
-; SSE3-NEXT: addl %eax, %r10d
-; SSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE3-NEXT: pextrw $3, %xmm2, %r14d
-; SSE3-NEXT: addl %eax, %r14d
-; SSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSE3-NEXT: pextrw $5, %xmm2, %r12d
-; SSE3-NEXT: addl %eax, %r12d
-; SSE3-NEXT: pextrw $6, %xmm2, %eax
-; SSE3-NEXT: pextrw $7, %xmm2, %r9d
-; SSE3-NEXT: addl %eax, %r9d
-; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: addl %edx, %ebx
+; SSE3-NEXT: pextrw $4, %xmm1, %edx
+; SSE3-NEXT: pextrw $5, %xmm1, %r14d
+; SSE3-NEXT: addl %edx, %r14d
+; SSE3-NEXT: pextrw $6, %xmm1, %edx
+; SSE3-NEXT: pextrw $7, %xmm1, %r12d
+; SSE3-NEXT: addl %edx, %r12d
+; SSE3-NEXT: movd %xmm2, %edi
+; SSE3-NEXT: pextrw $1, %xmm2, %edx
+; SSE3-NEXT: addl %edi, %edx
+; SSE3-NEXT: pextrw $2, %xmm2, %r9d
+; SSE3-NEXT: pextrw $3, %xmm2, %edi
+; SSE3-NEXT: addl %r9d, %edi
+; SSE3-NEXT: pextrw $4, %xmm2, %r11d
+; SSE3-NEXT: pextrw $5, %xmm2, %r9d
+; SSE3-NEXT: addl %r11d, %r9d
+; SSE3-NEXT: pextrw $6, %xmm2, %ebp
+; SSE3-NEXT: pextrw $7, %xmm2, %r11d
+; SSE3-NEXT: addl %ebp, %r11d
+; SSE3-NEXT: movd %xmm3, %r15d
; SSE3-NEXT: pextrw $1, %xmm3, %ebp
-; SSE3-NEXT: addl %eax, %ebp
-; SSE3-NEXT: pextrw $2, %xmm3, %edx
-; SSE3-NEXT: pextrw $3, %xmm3, %edi
-; SSE3-NEXT: addl %edx, %edi
-; SSE3-NEXT: pextrw $4, %xmm3, %edx
+; SSE3-NEXT: addl %r15d, %ebp
+; SSE3-NEXT: pextrw $2, %xmm3, %r13d
+; SSE3-NEXT: pextrw $3, %xmm3, %r15d
+; SSE3-NEXT: addl %r13d, %r15d
+; SSE3-NEXT: pextrw $4, %xmm3, %r13d
; SSE3-NEXT: pextrw $5, %xmm3, %ecx
-; SSE3-NEXT: addl %edx, %ecx
-; SSE3-NEXT: pextrw $6, %xmm3, %edx
+; SSE3-NEXT: addl %r13d, %ecx
+; SSE3-NEXT: pextrw $6, %xmm3, %r13d
; SSE3-NEXT: pextrw $7, %xmm3, %eax
-; SSE3-NEXT: addl %edx, %eax
-; SSE3-NEXT: movd %esi, %xmm8
-; SSE3-NEXT: movd %r8d, %xmm3
-; SSE3-NEXT: movd %ebx, %xmm9
-; SSE3-NEXT: movd %r13d, %xmm4
-; SSE3-NEXT: movd %r15d, %xmm10
-; SSE3-NEXT: movd %r11d, %xmm7
-; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
-; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero
+; SSE3-NEXT: addl %r13d, %eax
+; SSE3-NEXT: movd %r12d, %xmm2
+; SSE3-NEXT: movd %r14d, %xmm3
+; SSE3-NEXT: movd %ebx, %xmm5
+; SSE3-NEXT: movd %r10d, %xmm4
+; SSE3-NEXT: movd %r8d, %xmm6
+; SSE3-NEXT: movd %esi, %xmm7
+; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm8 = mem[0],zero,zero,zero
; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movd %ecx, %xmm6
-; SSE3-NEXT: movd %edi, %xmm13
-; SSE3-NEXT: movd %ebp, %xmm5
+; SSE3-NEXT: movd %eax, %xmm9
+; SSE3-NEXT: movd %ecx, %xmm10
+; SSE3-NEXT: movd %r15d, %xmm11
+; SSE3-NEXT: movd %ebp, %xmm12
+; SSE3-NEXT: movd %r11d, %xmm13
; SSE3-NEXT: movd %r9d, %xmm14
-; SSE3-NEXT: movd %r12d, %xmm2
-; SSE3-NEXT: movd %r14d, %xmm15
-; SSE3-NEXT: movd %r10d, %xmm1
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE3-NEXT: movd %edi, %xmm15
+; SSE3-NEXT: movd %edx, %xmm1
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
@@ -1133,13 +1133,13 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SSE3-NEXT: movd %xmm4, %r8d
-; SSE3-NEXT: addl %ecx, %r8d
+; SSE3-NEXT: movd %xmm4, %eax
+; SSE3-NEXT: addl %ecx, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE3-NEXT: movd %xmm0, %r9d
-; SSE3-NEXT: addl %edx, %r9d
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: addl %edx, %ecx
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT: movd %xmm0, %esi
@@ -1149,36 +1149,36 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %edx, %edi
-; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: movd %xmm1, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE3-NEXT: movd %xmm0, %r10d
-; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %r8d, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: movd %xmm0, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE3-NEXT: movd %xmm0, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: movd %xmm0, %r9d
+; SSE3-NEXT: addl %r8d, %r9d
+; SSE3-NEXT: movd %xmm3, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; SSE3-NEXT: movd %xmm0, %edx
-; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: movd %xmm0, %r10d
+; SSE3-NEXT: addl %r8d, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE3-NEXT: movd %xmm0, %r11d
+; SSE3-NEXT: movd %xmm0, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
-; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: addl %r11d, %eax
+; SSE3-NEXT: movd %xmm0, %r11d
+; SSE3-NEXT: addl %r8d, %r11d
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movd %r9d, %xmm2
-; SSE3-NEXT: movd %r8d, %xmm0
+; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movd %edx, %xmm2
+; SSE3-NEXT: movd %r11d, %xmm1
+; SSE3-NEXT: movd %r10d, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE3-NEXT: movd %ecx, %xmm3
-; SSE3-NEXT: movd %r10d, %xmm1
+; SSE3-NEXT: movd %r9d, %xmm3
+; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: retq
@@ -1247,87 +1247,87 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: pextrw $1, %xmm0, %r10d
-; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: pextrw $1, %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSE3-NEXT: pextrw $3, %xmm0, %r11d
-; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pextrw $3, %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSE3-NEXT: pextrw $5, %xmm0, %r12d
-; SSE3-NEXT: addl %eax, %r12d
+; SSE3-NEXT: pextrw $5, %xmm0, %r9d
+; SSE3-NEXT: addl %eax, %r9d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
-; SSE3-NEXT: pextrw $7, %xmm0, %r13d
-; SSE3-NEXT: addl %eax, %r13d
-; SSE3-NEXT: movd %xmm1, %eax
-; SSE3-NEXT: pextrw $1, %xmm1, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE3-NEXT: pextrw $2, %xmm1, %eax
-; SSE3-NEXT: pextrw $3, %xmm1, %ecx
-; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE3-NEXT: pextrw $4, %xmm1, %eax
-; SSE3-NEXT: pextrw $5, %xmm1, %r14d
-; SSE3-NEXT: addl %eax, %r14d
-; SSE3-NEXT: pextrw $6, %xmm1, %esi
-; SSE3-NEXT: pextrw $7, %xmm1, %r15d
-; SSE3-NEXT: addl %esi, %r15d
-; SSE3-NEXT: movd %xmm2, %esi
+; SSE3-NEXT: pextrw $7, %xmm0, %r10d
+; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: movd %xmm1, %ecx
+; SSE3-NEXT: pextrw $1, %xmm1, %eax
+; SSE3-NEXT: addl %ecx, %eax
+; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm1, %edi
+; SSE3-NEXT: pextrw $3, %xmm1, %eax
+; SSE3-NEXT: addl %edi, %eax
+; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE3-NEXT: pextrw $4, %xmm1, %r8d
+; SSE3-NEXT: pextrw $5, %xmm1, %edi
+; SSE3-NEXT: addl %r8d, %edi
+; SSE3-NEXT: pextrw $6, %xmm1, %r11d
+; SSE3-NEXT: pextrw $7, %xmm1, %r8d
+; SSE3-NEXT: addl %r11d, %r8d
+; SSE3-NEXT: movd %xmm2, %r11d
; SSE3-NEXT: pextrw $1, %xmm2, %ebp
-; SSE3-NEXT: addl %esi, %ebp
-; SSE3-NEXT: pextrw $2, %xmm2, %esi
-; SSE3-NEXT: pextrw $3, %xmm2, %edi
-; SSE3-NEXT: addl %esi, %edi
-; SSE3-NEXT: pextrw $4, %xmm2, %esi
-; SSE3-NEXT: pextrw $5, %xmm2, %eax
-; SSE3-NEXT: addl %esi, %eax
-; SSE3-NEXT: pextrw $6, %xmm2, %esi
-; SSE3-NEXT: pextrw $7, %xmm2, %ecx
-; SSE3-NEXT: addl %esi, %ecx
+; SSE3-NEXT: addl %r11d, %ebp
+; SSE3-NEXT: pextrw $2, %xmm2, %r11d
+; SSE3-NEXT: pextrw $3, %xmm2, %r14d
+; SSE3-NEXT: addl %r11d, %r14d
+; SSE3-NEXT: pextrw $4, %xmm2, %r11d
+; SSE3-NEXT: pextrw $5, %xmm2, %r15d
+; SSE3-NEXT: addl %r11d, %r15d
+; SSE3-NEXT: pextrw $6, %xmm2, %r11d
+; SSE3-NEXT: pextrw $7, %xmm2, %r12d
+; SSE3-NEXT: addl %r11d, %r12d
; SSE3-NEXT: movd %xmm3, %ebx
-; SSE3-NEXT: pextrw $1, %xmm3, %r9d
-; SSE3-NEXT: addl %ebx, %r9d
-; SSE3-NEXT: pextrw $2, %xmm3, %edx
+; SSE3-NEXT: pextrw $1, %xmm3, %r11d
+; SSE3-NEXT: addl %ebx, %r11d
+; SSE3-NEXT: pextrw $2, %xmm3, %r13d
; SSE3-NEXT: pextrw $3, %xmm3, %ebx
-; SSE3-NEXT: addl %edx, %ebx
-; SSE3-NEXT: pextrw $4, %xmm3, %edx
-; SSE3-NEXT: pextrw $5, %xmm3, %esi
-; SSE3-NEXT: addl %edx, %esi
-; SSE3-NEXT: pextrw $6, %xmm3, %r8d
-; SSE3-NEXT: pextrw $7, %xmm3, %edx
-; SSE3-NEXT: addl %r8d, %edx
-; SSE3-NEXT: movd %ecx, %xmm8
-; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movd %edi, %xmm9
+; SSE3-NEXT: addl %r13d, %ebx
+; SSE3-NEXT: pextrw $4, %xmm3, %r13d
+; SSE3-NEXT: pextrw $5, %xmm3, %ecx
+; SSE3-NEXT: addl %r13d, %ecx
+; SSE3-NEXT: pextrw $6, %xmm3, %r13d
+; SSE3-NEXT: pextrw $7, %xmm3, %eax
+; SSE3-NEXT: addl %r13d, %eax
+; SSE3-NEXT: movd %r12d, %xmm2
+; SSE3-NEXT: movd %r15d, %xmm3
+; SSE3-NEXT: movd %r14d, %xmm5
; SSE3-NEXT: movd %ebp, %xmm4
-; SSE3-NEXT: movd %r13d, %xmm10
-; SSE3-NEXT: movd %r12d, %xmm7
-; SSE3-NEXT: movd %r11d, %xmm11
-; SSE3-NEXT: movd %r10d, %xmm0
-; SSE3-NEXT: movd %edx, %xmm12
-; SSE3-NEXT: movd %esi, %xmm6
-; SSE3-NEXT: movd %ebx, %xmm13
-; SSE3-NEXT: movd %r9d, %xmm5
-; SSE3-NEXT: movd %r15d, %xmm14
-; SSE3-NEXT: movd %r14d, %xmm2
+; SSE3-NEXT: movd %r10d, %xmm6
+; SSE3-NEXT: movd %r9d, %xmm7
+; SSE3-NEXT: movd %esi, %xmm8
+; SSE3-NEXT: movd %edx, %xmm0
+; SSE3-NEXT: movd %eax, %xmm9
+; SSE3-NEXT: movd %ecx, %xmm10
+; SSE3-NEXT: movd %ebx, %xmm11
+; SSE3-NEXT: movd %r11d, %xmm12
+; SSE3-NEXT: movd %r8d, %xmm13
+; SSE3-NEXT: movd %edi, %xmm14
; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll
index 2250a9c78573b..29685b647f833 100644
--- a/llvm/test/CodeGen/X86/haddsub-4.ll
+++ b/llvm/test/CodeGen/X86/haddsub-4.ll
@@ -314,20 +314,19 @@ define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nou
define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16f32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm4, %xmm8
-; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: movaps %xmm0, %xmm8
; SSE-NEXT: haddps %xmm3, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT: haddps %xmm7, %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
-; SSE-NEXT: haddps %xmm1, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
-; SSE-NEXT: haddps %xmm5, %xmm8
+; SSE-NEXT: haddps %xmm1, %xmm8
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,2,1,0]
+; SSE-NEXT: haddps %xmm5, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm6, %xmm1
-; SSE-NEXT: movaps %xmm4, %xmm2
-; SSE-NEXT: movaps %xmm8, %xmm3
+; SSE-NEXT: movaps %xmm8, %xmm2
+; SSE-NEXT: movaps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v16f32:
diff --git a/llvm/test/CodeGen/X86/hoist-invariant-load.ll b/llvm/test/CodeGen/X86/hoist-invariant-load.ll
index 5611c2068e2b5..8687b64b7f593 100644
--- a/llvm/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/llvm/test/CodeGen/X86/hoist-invariant-load.ll
@@ -218,12 +218,12 @@ define void @test_multi_def(ptr dereferenceable(8) %x1,
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: xorl %r8d, %r8d
; CHECK-NEXT: movq (%rdi), %rdx
-; CHECK-NEXT: movq (%rsi), %r9
+; CHECK-NEXT: movq (%rsi), %rsi
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB4_2: ## %for.body
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mulxq %r9, %rsi, %rdi
-; CHECK-NEXT: addq %rsi, (%rax)
+; CHECK-NEXT: mulxq %rsi, %r9, %rdi
+; CHECK-NEXT: addq %r9, (%rax)
; CHECK-NEXT: adcq %rdi, 8(%rax)
; CHECK-NEXT: ## %bb.1: ## %for.check
; CHECK-NEXT: ## in Loop: Header=BB4_2 Depth=1
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index 48ea7cc26ccdf..fd806e56af080 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -260,20 +260,19 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
; X64-NOBMI-NEXT: testq %rdi, %rdi
; X64-NOBMI-NEXT: je .LBB1_3
; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader
-; X64-NOBMI-NEXT: movq %rcx, %r8
-; X64-NOBMI-NEXT: movq %rdx, %r9
+; X64-NOBMI-NEXT: movq %rdx, %r8
; X64-NOBMI-NEXT: xorl %r10d, %r10d
-; X64-NOBMI-NEXT: xorl %ecx, %ecx
+; X64-NOBMI-NEXT: xorl %r9d, %r9d
; X64-NOBMI-NEXT: .p2align 4, 0x90
; X64-NOBMI-NEXT: .LBB1_2: # %for.body
; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NOBMI-NEXT: movq %r8, %rax
-; X64-NOBMI-NEXT: mulq (%r9,%rcx,8)
+; X64-NOBMI-NEXT: movq %rcx, %rax
+; X64-NOBMI-NEXT: mulq (%r8,%r9,8)
; X64-NOBMI-NEXT: addq %r10, %rax
; X64-NOBMI-NEXT: adcq $0, %rdx
-; X64-NOBMI-NEXT: movq %rax, (%rsi,%rcx,8)
-; X64-NOBMI-NEXT: incq %rcx
-; X64-NOBMI-NEXT: cmpq %rcx, %rdi
+; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8)
+; X64-NOBMI-NEXT: incq %r9
+; X64-NOBMI-NEXT: cmpq %r9, %rdi
; X64-NOBMI-NEXT: movq %rdx, %r10
; X64-NOBMI-NEXT: jne .LBB1_2
; X64-NOBMI-NEXT: .LBB1_3: # %for.end
@@ -285,21 +284,20 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
; X64-BMI-NEXT: testq %rdi, %rdi
; X64-BMI-NEXT: je .LBB1_3
; X64-BMI-NEXT: # %bb.1: # %for.body.preheader
-; X64-BMI-NEXT: movq %rcx, %r8
-; X64-BMI-NEXT: movq %rdx, %r9
-; X64-BMI-NEXT: xorl %r10d, %r10d
-; X64-BMI-NEXT: xorl %ecx, %ecx
+; X64-BMI-NEXT: movq %rdx, %rax
+; X64-BMI-NEXT: xorl %r9d, %r9d
+; X64-BMI-NEXT: xorl %r8d, %r8d
; X64-BMI-NEXT: .p2align 4, 0x90
; X64-BMI-NEXT: .LBB1_2: # %for.body
; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-BMI-NEXT: movq %r8, %rdx
-; X64-BMI-NEXT: mulxq (%r9,%rcx,8), %rax, %rdx
-; X64-BMI-NEXT: addq %r10, %rax
+; X64-BMI-NEXT: movq %rcx, %rdx
+; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx
+; X64-BMI-NEXT: addq %r9, %r10
; X64-BMI-NEXT: adcq $0, %rdx
-; X64-BMI-NEXT: movq %rax, (%rsi,%rcx,8)
-; X64-BMI-NEXT: incq %rcx
-; X64-BMI-NEXT: cmpq %rcx, %rdi
-; X64-BMI-NEXT: movq %rdx, %r10
+; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8)
+; X64-BMI-NEXT: incq %r8
+; X64-BMI-NEXT: cmpq %r8, %rdi
+; X64-BMI-NEXT: movq %rdx, %r9
; X64-BMI-NEXT: jne .LBB1_2
; X64-BMI-NEXT: .LBB1_3: # %for.end
; X64-BMI-NEXT: xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/load-local-v3i1.ll b/llvm/test/CodeGen/X86/load-local-v3i1.ll
index ae3d9cdb326d3..52e0eb826d143 100644
--- a/llvm/test/CodeGen/X86/load-local-v3i1.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i1.ll
@@ -93,7 +93,7 @@ define void @local_load_v3i1(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr %p
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movzbl (%rdx), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrb %cl
@@ -102,17 +102,17 @@ define void @local_load_v3i1(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr %p
; CHECK-NEXT: shrb $2, %dl
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: movzbl %al, %ebp
-; CHECK-NEXT: movzbl %dl, %r15d
-; CHECK-NEXT: movzbl %cl, %ebx
+; CHECK-NEXT: movzbl %dl, %r14d
+; CHECK-NEXT: movzbl %cl, %r15d
; CHECK-NEXT: movq %rsi, %rdi
; CHECK-NEXT: movl %ebp, %esi
-; CHECK-NEXT: movl %ebx, %edx
-; CHECK-NEXT: movl %r15d, %ecx
+; CHECK-NEXT: movl %r15d, %edx
+; CHECK-NEXT: movl %r14d, %ecx
; CHECK-NEXT: callq masked_load_v3@PLT
-; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: movq %rbx, %rdi
; CHECK-NEXT: movl %ebp, %esi
-; CHECK-NEXT: movl %ebx, %edx
-; CHECK-NEXT: movl %r15d, %ecx
+; CHECK-NEXT: movl %r15d, %edx
+; CHECK-NEXT: movl %r14d, %ecx
; CHECK-NEXT: callq masked_store4_v3@PLT
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/lrshrink.ll b/llvm/test/CodeGen/X86/lrshrink.ll
index ea1f96b18f1b9..51f675d245190 100644
--- a/llvm/test/CodeGen/X86/lrshrink.ll
+++ b/llvm/test/CodeGen/X86/lrshrink.ll
@@ -16,28 +16,28 @@ define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) {
; CHECK-NEXT: .cfi_offset %rbx, -32
; CHECK-NEXT: .cfi_offset %r14, -24
; CHECK-NEXT: .cfi_offset %r15, -16
-; CHECK-NEXT: movq %rcx, %r14
-; CHECK-NEXT: movl $4, %r15d
+; CHECK-NEXT: movq %rcx, %rbx
+; CHECK-NEXT: movl $4, %r14d
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1: # %then
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; CHECK-NEXT: movl $10, %r15d
+; CHECK-NEXT: movl $10, %r14d
; CHECK-NEXT: movq %rdx, %rsi
-; CHECK-NEXT: movq %r8, %r14
+; CHECK-NEXT: movq %r8, %rbx
; CHECK-NEXT: .LBB0_2: # %else
-; CHECK-NEXT: addq %r9, %r14
-; CHECK-NEXT: addq %rsi, %r15
-; CHECK-NEXT: callq _Z3foov@PLT
-; CHECK-NEXT: movl %eax, %ebx
-; CHECK-NEXT: addq %r15, %rbx
+; CHECK-NEXT: addq %r9, %rbx
+; CHECK-NEXT: addq %rsi, %r14
; CHECK-NEXT: callq _Z3foov@PLT
; CHECK-NEXT: movl %eax, %r15d
-; CHECK-NEXT: addq %rbx, %r15
+; CHECK-NEXT: addq %r14, %r15
+; CHECK-NEXT: callq _Z3foov@PLT
+; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: addq %r15, %r14
; CHECK-NEXT: callq _Z3foov@PLT
; CHECK-NEXT: movl %eax, %eax
-; CHECK-NEXT: addq %r15, %rax
; CHECK-NEXT: addq %r14, %rax
+; CHECK-NEXT: addq %rbx, %rax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r14
diff --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
index 3c35cb0138416..9f0c1ea1dc3f6 100644
--- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -10,82 +10,84 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
; GENERIC-LABEL: t:
; GENERIC: ## %bb.0: ## %entry
; GENERIC-NEXT: pushq %rbp
+; GENERIC-NEXT: pushq %r15
; GENERIC-NEXT: pushq %r14
; GENERIC-NEXT: pushq %rbx
; GENERIC-NEXT: ## kill: def $ecx killed $ecx def $rcx
-; GENERIC-NEXT: movl (%rdx), %eax
+; GENERIC-NEXT: movl (%rdx), %r8d
; GENERIC-NEXT: movl 4(%rdx), %ebx
; GENERIC-NEXT: decl %ecx
-; GENERIC-NEXT: leaq 20(%rdx), %r11
-; GENERIC-NEXT: movq _Te0@GOTPCREL(%rip), %r9
-; GENERIC-NEXT: movq _Te1@GOTPCREL(%rip), %r8
+; GENERIC-NEXT: leaq 20(%rdx), %r9
+; GENERIC-NEXT: movq _Te0@GOTPCREL(%rip), %rdi
+; GENERIC-NEXT: movq _Te1@GOTPCREL(%rip), %rax
; GENERIC-NEXT: movq _Te3@GOTPCREL(%rip), %r10
-; GENERIC-NEXT: movq %rcx, %r14
+; GENERIC-NEXT: movq %rcx, %r11
; GENERIC-NEXT: .p2align 4, 0x90
; GENERIC-NEXT: LBB0_1: ## %bb
; GENERIC-NEXT: ## =>This Inner Loop Header: Depth=1
-; GENERIC-NEXT: movzbl %al, %edi
-; GENERIC-NEXT: ## kill: def $eax killed $eax def $rax
-; GENERIC-NEXT: shrl $24, %eax
+; GENERIC-NEXT: movzbl %r8b, %r14d
+; GENERIC-NEXT: ## kill: def $r8d killed $r8d def $r8
+; GENERIC-NEXT: shrl $24, %r8d
; GENERIC-NEXT: movl %ebx, %ebp
; GENERIC-NEXT: shrl $16, %ebp
-; GENERIC-NEXT: movzbl %bpl, %ebp
-; GENERIC-NEXT: movl (%r8,%rbp,4), %ebp
-; GENERIC-NEXT: xorl (%r9,%rax,4), %ebp
-; GENERIC-NEXT: xorl -12(%r11), %ebp
+; GENERIC-NEXT: movzbl %bpl, %r15d
+; GENERIC-NEXT: movl (%rax,%r15,4), %ebp
+; GENERIC-NEXT: xorl (%rdi,%r8,4), %ebp
+; GENERIC-NEXT: xorl -12(%r9), %ebp
; GENERIC-NEXT: shrl $24, %ebx
-; GENERIC-NEXT: movl (%r10,%rdi,4), %edi
-; GENERIC-NEXT: xorl (%r9,%rbx,4), %edi
-; GENERIC-NEXT: xorl -8(%r11), %edi
-; GENERIC-NEXT: movl %ebp, %eax
-; GENERIC-NEXT: shrl $24, %eax
-; GENERIC-NEXT: movl (%r9,%rax,4), %eax
-; GENERIC-NEXT: subq $1, %r14
+; GENERIC-NEXT: movl (%r10,%r14,4), %r14d
+; GENERIC-NEXT: xorl (%rdi,%rbx,4), %r14d
+; GENERIC-NEXT: xorl -8(%r9), %r14d
+; GENERIC-NEXT: movl %ebp, %r8d
+; GENERIC-NEXT: shrl $24, %r8d
+; GENERIC-NEXT: movl (%rdi,%r8,4), %r8d
+; GENERIC-NEXT: subq $1, %r11
; GENERIC-NEXT: jb LBB0_3
; GENERIC-NEXT: ## %bb.2: ## %bb1
; GENERIC-NEXT: ## in Loop: Header=BB0_1 Depth=1
-; GENERIC-NEXT: movl %edi, %ebx
+; GENERIC-NEXT: movl %r14d, %ebx
; GENERIC-NEXT: shrl $16, %ebx
; GENERIC-NEXT: movzbl %bl, %ebx
-; GENERIC-NEXT: xorl (%r8,%rbx,4), %eax
-; GENERIC-NEXT: xorl -4(%r11), %eax
-; GENERIC-NEXT: shrl $24, %edi
+; GENERIC-NEXT: xorl (%rax,%rbx,4), %r8d
+; GENERIC-NEXT: xorl -4(%r9), %r8d
+; GENERIC-NEXT: shrl $24, %r14d
; GENERIC-NEXT: movzbl %bpl, %ebx
; GENERIC-NEXT: movl (%r10,%rbx,4), %ebx
-; GENERIC-NEXT: xorl (%r9,%rdi,4), %ebx
-; GENERIC-NEXT: xorl (%r11), %ebx
-; GENERIC-NEXT: addq $16, %r11
+; GENERIC-NEXT: xorl (%rdi,%r14,4), %ebx
+; GENERIC-NEXT: xorl (%r9), %ebx
+; GENERIC-NEXT: addq $16, %r9
; GENERIC-NEXT: jmp LBB0_1
; GENERIC-NEXT: LBB0_3: ## %bb2
; GENERIC-NEXT: shlq $4, %rcx
-; GENERIC-NEXT: andl $-16777216, %eax ## imm = 0xFF000000
-; GENERIC-NEXT: movl %edi, %ebx
-; GENERIC-NEXT: shrl $16, %ebx
-; GENERIC-NEXT: movzbl %bl, %ebx
-; GENERIC-NEXT: movzbl 2(%r8,%rbx,4), %ebx
-; GENERIC-NEXT: shll $16, %ebx
-; GENERIC-NEXT: orl %eax, %ebx
-; GENERIC-NEXT: xorl 16(%rcx,%rdx), %ebx
-; GENERIC-NEXT: shrl $8, %edi
-; GENERIC-NEXT: movzbl 3(%r9,%rdi,4), %eax
-; GENERIC-NEXT: shll $24, %eax
-; GENERIC-NEXT: movzbl %bpl, %edi
-; GENERIC-NEXT: movzbl 2(%r8,%rdi,4), %edi
-; GENERIC-NEXT: shll $16, %edi
-; GENERIC-NEXT: orl %eax, %edi
-; GENERIC-NEXT: xorl 20(%rcx,%rdx), %edi
-; GENERIC-NEXT: movl %ebx, %eax
-; GENERIC-NEXT: shrl $24, %eax
-; GENERIC-NEXT: movb %al, (%rsi)
-; GENERIC-NEXT: shrl $16, %ebx
-; GENERIC-NEXT: movb %bl, 1(%rsi)
-; GENERIC-NEXT: movl %edi, %eax
-; GENERIC-NEXT: shrl $24, %eax
-; GENERIC-NEXT: movb %al, 4(%rsi)
-; GENERIC-NEXT: shrl $16, %edi
-; GENERIC-NEXT: movb %dil, 5(%rsi)
+; GENERIC-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000
+; GENERIC-NEXT: movl %r14d, %r9d
+; GENERIC-NEXT: shrl $16, %r9d
+; GENERIC-NEXT: movzbl %r9b, %r9d
+; GENERIC-NEXT: movzbl 2(%rax,%r9,4), %r9d
+; GENERIC-NEXT: shll $16, %r9d
+; GENERIC-NEXT: orl %r8d, %r9d
+; GENERIC-NEXT: xorl 16(%rcx,%rdx), %r9d
+; GENERIC-NEXT: shrl $8, %r14d
+; GENERIC-NEXT: movzbl 3(%rdi,%r14,4), %edi
+; GENERIC-NEXT: shll $24, %edi
+; GENERIC-NEXT: movzbl %bpl, %r8d
+; GENERIC-NEXT: movzbl 2(%rax,%r8,4), %eax
+; GENERIC-NEXT: shll $16, %eax
+; GENERIC-NEXT: orl %edi, %eax
+; GENERIC-NEXT: xorl 20(%rcx,%rdx), %eax
+; GENERIC-NEXT: movl %r9d, %ecx
+; GENERIC-NEXT: shrl $24, %ecx
+; GENERIC-NEXT: movb %cl, (%rsi)
+; GENERIC-NEXT: shrl $16, %r9d
+; GENERIC-NEXT: movb %r9b, 1(%rsi)
+; GENERIC-NEXT: movl %eax, %ecx
+; GENERIC-NEXT: shrl $24, %ecx
+; GENERIC-NEXT: movb %cl, 4(%rsi)
+; GENERIC-NEXT: shrl $16, %eax
+; GENERIC-NEXT: movb %al, 5(%rsi)
; GENERIC-NEXT: popq %rbx
; GENERIC-NEXT: popq %r14
+; GENERIC-NEXT: popq %r15
; GENERIC-NEXT: popq %rbp
; GENERIC-NEXT: retq
;
@@ -96,76 +98,77 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
; ATOM-NEXT: pushq %r14
; ATOM-NEXT: pushq %rbx
; ATOM-NEXT: ## kill: def $ecx killed $ecx def $rcx
-; ATOM-NEXT: movl (%rdx), %r15d
-; ATOM-NEXT: movl 4(%rdx), %eax
-; ATOM-NEXT: leaq 20(%rdx), %r11
-; ATOM-NEXT: movq _Te0@GOTPCREL(%rip), %r9
-; ATOM-NEXT: movq _Te1@GOTPCREL(%rip), %r8
+; ATOM-NEXT: movl (%rdx), %r8d
+; ATOM-NEXT: movl 4(%rdx), %r15d
+; ATOM-NEXT: leaq 20(%rdx), %r9
+; ATOM-NEXT: movq _Te0@GOTPCREL(%rip), %rdi
+; ATOM-NEXT: movq _Te1@GOTPCREL(%rip), %rax
; ATOM-NEXT: movq _Te3@GOTPCREL(%rip), %r10
; ATOM-NEXT: decl %ecx
-; ATOM-NEXT: movq %rcx, %r14
+; ATOM-NEXT: movq %rcx, %r11
; ATOM-NEXT: .p2align 4, 0x90
; ATOM-NEXT: LBB0_1: ## %bb
; ATOM-NEXT: ## =>This Inner Loop Header: Depth=1
-; ATOM-NEXT: movl %eax, %edi
-; ATOM-NEXT: movl %r15d, %ebp
-; ATOM-NEXT: shrl $24, %eax
-; ATOM-NEXT: shrl $16, %edi
-; ATOM-NEXT: shrl $24, %ebp
-; ATOM-NEXT: movzbl %dil, %edi
-; ATOM-NEXT: movl (%r8,%rdi,4), %ebx
-; ATOM-NEXT: movzbl %r15b, %edi
-; ATOM-NEXT: xorl (%r9,%rbp,4), %ebx
-; ATOM-NEXT: movl (%r10,%rdi,4), %edi
-; ATOM-NEXT: xorl -12(%r11), %ebx
-; ATOM-NEXT: xorl (%r9,%rax,4), %edi
-; ATOM-NEXT: movl %ebx, %eax
-; ATOM-NEXT: xorl -8(%r11), %edi
-; ATOM-NEXT: shrl $24, %eax
-; ATOM-NEXT: movl (%r9,%rax,4), %r15d
-; ATOM-NEXT: subq $1, %r14
-; ATOM-NEXT: movl %edi, %eax
+; ATOM-NEXT: movl %r15d, %ebx
+; ATOM-NEXT: movl %r8d, %r14d
+; ATOM-NEXT: movzbl %r8b, %r8d
+; ATOM-NEXT: shrl $24, %r15d
+; ATOM-NEXT: shrl $16, %ebx
+; ATOM-NEXT: shrl $24, %r14d
+; ATOM-NEXT: movzbl %bl, %ebx
+; ATOM-NEXT: movl (%rax,%rbx,4), %ebx
+; ATOM-NEXT: xorl (%rdi,%r14,4), %ebx
+; ATOM-NEXT: movl (%r10,%r8,4), %r14d
+; ATOM-NEXT: xorl -12(%r9), %ebx
+; ATOM-NEXT: xorl (%rdi,%r15,4), %r14d
+; ATOM-NEXT: movl %ebx, %r8d
+; ATOM-NEXT: xorl -8(%r9), %r14d
+; ATOM-NEXT: shrl $24, %r8d
+; ATOM-NEXT: subq $1, %r11
+; ATOM-NEXT: movl (%rdi,%r8,4), %r8d
; ATOM-NEXT: jb LBB0_3
; ATOM-NEXT: ## %bb.2: ## %bb1
; ATOM-NEXT: ## in Loop: Header=BB0_1 Depth=1
-; ATOM-NEXT: shrl $16, %eax
-; ATOM-NEXT: shrl $24, %edi
-; ATOM-NEXT: movzbl %al, %eax
-; ATOM-NEXT: xorl (%r8,%rax,4), %r15d
-; ATOM-NEXT: movzbl %bl, %eax
-; ATOM-NEXT: movl (%r10,%rax,4), %eax
-; ATOM-NEXT: xorl -4(%r11), %r15d
-; ATOM-NEXT: xorl (%r9,%rdi,4), %eax
-; ATOM-NEXT: xorl (%r11), %eax
-; ATOM-NEXT: addq $16, %r11
+; ATOM-NEXT: movl %r14d, %ebp
+; ATOM-NEXT: movzbl %bl, %ebx
+; ATOM-NEXT: shrl $24, %r14d
+; ATOM-NEXT: shrl $16, %ebp
+; ATOM-NEXT: movzbl %bpl, %r15d
+; ATOM-NEXT: xorl (%rax,%r15,4), %r8d
+; ATOM-NEXT: movl (%r10,%rbx,4), %r15d
+; ATOM-NEXT: xorl (%rdi,%r14,4), %r15d
+; ATOM-NEXT: xorl -4(%r9), %r8d
+; ATOM-NEXT: xorl (%r9), %r15d
+; ATOM-NEXT: addq $16, %r9
; ATOM-NEXT: jmp LBB0_1
; ATOM-NEXT: LBB0_3: ## %bb2
-; ATOM-NEXT: shrl $16, %eax
-; ATOM-NEXT: shrl $8, %edi
-; ATOM-NEXT: movzbl %bl, %ebp
-; ATOM-NEXT: andl $-16777216, %r15d ## imm = 0xFF000000
+; ATOM-NEXT: movl %r14d, %r9d
+; ATOM-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000
+; ATOM-NEXT: shrl $8, %r14d
; ATOM-NEXT: shlq $4, %rcx
-; ATOM-NEXT: movzbl %al, %eax
-; ATOM-NEXT: movzbl 3(%r9,%rdi,4), %edi
-; ATOM-NEXT: movzbl 2(%r8,%rbp,4), %ebp
-; ATOM-NEXT: movzbl 2(%r8,%rax,4), %eax
+; ATOM-NEXT: shrl $16, %r9d
+; ATOM-NEXT: movzbl 3(%rdi,%r14,4), %edi
+; ATOM-NEXT: movzbl %r9b, %r9d
; ATOM-NEXT: shll $24, %edi
-; ATOM-NEXT: shll $16, %ebp
+; ATOM-NEXT: movzbl 2(%rax,%r9,4), %r9d
+; ATOM-NEXT: shll $16, %r9d
+; ATOM-NEXT: orl %r8d, %r9d
+; ATOM-NEXT: movzbl %bl, %r8d
+; ATOM-NEXT: movzbl 2(%rax,%r8,4), %eax
+; ATOM-NEXT: xorl 16(%rcx,%rdx), %r9d
; ATOM-NEXT: shll $16, %eax
-; ATOM-NEXT: orl %edi, %ebp
-; ATOM-NEXT: orl %r15d, %eax
-; ATOM-NEXT: xorl 20(%rcx,%rdx), %ebp
-; ATOM-NEXT: xorl 16(%rcx,%rdx), %eax
-; ATOM-NEXT: movl %eax, %edi
-; ATOM-NEXT: shrl $16, %eax
+; ATOM-NEXT: orl %edi, %eax
+; ATOM-NEXT: movl %r9d, %edi
+; ATOM-NEXT: shrl $16, %r9d
+; ATOM-NEXT: xorl 20(%rcx,%rdx), %eax
; ATOM-NEXT: shrl $24, %edi
+; ATOM-NEXT: movl %eax, %ecx
+; ATOM-NEXT: shrl $16, %eax
; ATOM-NEXT: movb %dil, (%rsi)
-; ATOM-NEXT: movb %al, 1(%rsi)
-; ATOM-NEXT: movl %ebp, %eax
-; ATOM-NEXT: shrl $16, %ebp
-; ATOM-NEXT: shrl $24, %eax
-; ATOM-NEXT: movb %al, 4(%rsi)
-; ATOM-NEXT: movb %bpl, 5(%rsi)
+; ATOM-NEXT: movb %r9b, 1(%rsi)
+; ATOM-NEXT: shrl $24, %ecx
+; ATOM-NEXT: movb %cl, 4(%rsi)
+; ATOM-NEXT: movb %al, 5(%rsi)
; ATOM-NEXT: popq %rbx
; ATOM-NEXT: popq %r14
; ATOM-NEXT: popq %r15
diff --git a/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll
index 0662a5c521ab6..d9d5e2846ed0f 100644
--- a/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll
@@ -236,16 +236,16 @@ define i32 @test_zext_cmp8(i32 %a, i32 %b, i32 %c, i32 %d) {
; NOFASTLZCNT-LABEL: test_zext_cmp8:
; NOFASTLZCNT: # %bb.0: # %entry
; NOFASTLZCNT-NEXT: testl %edi, %edi
-; NOFASTLZCNT-NEXT: sete %dil
-; NOFASTLZCNT-NEXT: testl %esi, %esi
; NOFASTLZCNT-NEXT: sete %al
-; NOFASTLZCNT-NEXT: orb %dil, %al
+; NOFASTLZCNT-NEXT: testl %esi, %esi
+; NOFASTLZCNT-NEXT: sete %sil
+; NOFASTLZCNT-NEXT: orb %al, %sil
; NOFASTLZCNT-NEXT: testl %edx, %edx
-; NOFASTLZCNT-NEXT: sete %dl
+; NOFASTLZCNT-NEXT: sete %al
; NOFASTLZCNT-NEXT: testl %ecx, %ecx
; NOFASTLZCNT-NEXT: sete %cl
-; NOFASTLZCNT-NEXT: orb %dl, %cl
; NOFASTLZCNT-NEXT: orb %al, %cl
+; NOFASTLZCNT-NEXT: orb %sil, %cl
; NOFASTLZCNT-NEXT: movzbl %cl, %eax
; NOFASTLZCNT-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
index 68b54fd9a83fd..f02b4d1c7726d 100644
--- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -1012,63 +1012,63 @@ define <4 x i64> @reassociate_umax_v4i64(<4 x i64> %x0, <4 x i64> %x1, <4 x i64>
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE-NEXT: movdqa %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm8, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm8, %xmm2
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pxor %xmm2, %xmm8
; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE-NEXT: pcmpgtd %xmm8, %xmm9
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: pcmpeqd %xmm3, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE-NEXT: pand %xmm10, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
-; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: por %xmm8, %xmm3
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: por %xmm5, %xmm3
; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm8, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm2, %xmm5
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSE-NEXT: pand %xmm9, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm8, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm5, %xmm0
; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: pxor %xmm7, %xmm8
-; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE-NEXT: pand %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: pand %xmm1, %xmm7
; SSE-NEXT: pandn %xmm3, %xmm1
; SSE-NEXT: por %xmm7, %xmm1
@@ -1218,63 +1218,63 @@ define <4 x i64> @reassociate_smax_v4i64(<4 x i64> %x0, <4 x i64> %x1, <4 x i64>
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE-NEXT: movdqa %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm8, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm8, %xmm2
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pxor %xmm2, %xmm8
; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE-NEXT: pcmpgtd %xmm8, %xmm9
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: pcmpeqd %xmm3, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE-NEXT: pand %xmm10, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
-; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: por %xmm8, %xmm3
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: por %xmm5, %xmm3
; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm8, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm2, %xmm5
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSE-NEXT: pand %xmm9, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm8, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm5, %xmm0
; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: pxor %xmm7, %xmm8
-; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE-NEXT: pand %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: pand %xmm1, %xmm7
; SSE-NEXT: pandn %xmm3, %xmm1
; SSE-NEXT: por %xmm7, %xmm1
@@ -1425,63 +1425,63 @@ define <4 x i64> @reassociate_umin_v4i64(<4 x i64> %x0, <4 x i64> %x1, <4 x i64>
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE-NEXT: movdqa %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm8, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm8, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pxor %xmm2, %xmm8
+; SSE-NEXT: movdqa %xmm8, %xmm9
; SSE-NEXT: pcmpgtd %xmm3, %xmm9
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: pcmpeqd %xmm3, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE-NEXT: pand %xmm10, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
-; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: por %xmm8, %xmm3
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: por %xmm5, %xmm3
; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm8, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm2, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: pcmpgtd %xmm1, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSE-NEXT: pand %xmm9, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm8, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm5, %xmm0
; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: pxor %xmm7, %xmm8
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm8, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE-NEXT: pand %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: pand %xmm1, %xmm7
; SSE-NEXT: pandn %xmm3, %xmm1
; SSE-NEXT: por %xmm7, %xmm1
@@ -1631,63 +1631,63 @@ define <4 x i64> @reassociate_smin_v4i64(<4 x i64> %x0, <4 x i64> %x1, <4 x i64>
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE-NEXT: movdqa %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm8, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm8, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pxor %xmm2, %xmm8
+; SSE-NEXT: movdqa %xmm8, %xmm9
; SSE-NEXT: pcmpgtd %xmm3, %xmm9
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: pcmpeqd %xmm3, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE-NEXT: pand %xmm10, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
-; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: por %xmm8, %xmm3
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: pandn %xmm1, %xmm3
; SSE-NEXT: por %xmm5, %xmm3
; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm8, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm2, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: pcmpgtd %xmm1, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSE-NEXT: pand %xmm9, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm8, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm5, %xmm0
; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: pxor %xmm7, %xmm8
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm8, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE-NEXT: pand %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: pand %xmm1, %xmm7
; SSE-NEXT: pandn %xmm3, %xmm1
; SSE-NEXT: por %xmm7, %xmm1
@@ -1947,13 +1947,13 @@ define <8 x i64> @reassociate_umax_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm9
-; SSE-NEXT: pand %xmm9, %xmm8
-; SSE-NEXT: pandn %xmm3, %xmm9
-; SSE-NEXT: por %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm9, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: pxor %xmm4, %xmm6
@@ -1963,106 +1963,106 @@ define <8 x i64> @reassociate_umax_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm6
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm8
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pandn %xmm2, %xmm8
-; SSE-NEXT: por %xmm5, %xmm8
-; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm9
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm5
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm8
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: por %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm6
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm5
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: pand %xmm7, %xmm9
+; SSE-NEXT: pandn %xmm0, %xmm7
+; SSE-NEXT: por %xmm9, %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm7, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pand %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm9, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm9
; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: por %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm6, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pcmpgtd %xmm2, %xmm7
+; SSE-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm8, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE-NEXT: pxor %xmm6, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm7
+; SSE-NEXT: pcmpgtd %xmm3, %xmm7
; SSE-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: pandn %xmm9, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: por %xmm6, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: reassociate_umax_v8i64:
@@ -2328,13 +2328,13 @@ define <8 x i64> @reassociate_smax_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm9
-; SSE-NEXT: pand %xmm9, %xmm8
-; SSE-NEXT: pandn %xmm3, %xmm9
-; SSE-NEXT: por %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm9, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: pxor %xmm4, %xmm6
@@ -2344,106 +2344,106 @@ define <8 x i64> @reassociate_smax_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm6
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm8
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pandn %xmm2, %xmm8
-; SSE-NEXT: por %xmm5, %xmm8
-; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm9
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm5
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm8
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: por %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm6
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm5
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: pand %xmm7, %xmm9
+; SSE-NEXT: pandn %xmm0, %xmm7
+; SSE-NEXT: por %xmm9, %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm7, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pand %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm9, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm9
; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: por %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm6, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pcmpgtd %xmm2, %xmm7
+; SSE-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm8, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE-NEXT: pxor %xmm6, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm7
+; SSE-NEXT: pcmpgtd %xmm3, %xmm7
; SSE-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: pandn %xmm9, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: por %xmm6, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: reassociate_smax_v8i64:
@@ -2712,13 +2712,13 @@ define <8 x i64> @reassociate_umin_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm9
-; SSE-NEXT: pand %xmm9, %xmm8
-; SSE-NEXT: pandn %xmm3, %xmm9
-; SSE-NEXT: por %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm9, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: pxor %xmm4, %xmm6
@@ -2728,106 +2728,106 @@ define <8 x i64> @reassociate_umin_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm6
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm8
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pandn %xmm2, %xmm8
-; SSE-NEXT: por %xmm5, %xmm8
-; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm9
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm5
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pcmpgtd %xmm2, %xmm7
+; SSE-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm8
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: por %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm6
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm5
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: pand %xmm7, %xmm9
+; SSE-NEXT: pandn %xmm0, %xmm7
+; SSE-NEXT: por %xmm9, %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm7, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pand %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm9, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm9
; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: por %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm6, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm8, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE-NEXT: pxor %xmm6, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: pcmpgtd %xmm4, %xmm7
; SSE-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: pandn %xmm9, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: por %xmm6, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: reassociate_umin_v8i64:
@@ -3093,13 +3093,13 @@ define <8 x i64> @reassociate_smin_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm9
-; SSE-NEXT: pand %xmm9, %xmm8
-; SSE-NEXT: pandn %xmm3, %xmm9
-; SSE-NEXT: por %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm9, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: pxor %xmm4, %xmm6
@@ -3109,106 +3109,106 @@ define <8 x i64> @reassociate_smin_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm6
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm8
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pandn %xmm2, %xmm8
-; SSE-NEXT: por %xmm5, %xmm8
-; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm9
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm5
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pcmpgtd %xmm2, %xmm7
+; SSE-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm8
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: por %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm6
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm5
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: pand %xmm7, %xmm9
+; SSE-NEXT: pandn %xmm0, %xmm7
+; SSE-NEXT: por %xmm9, %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm7, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pand %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm9, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm9
; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: por %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm6, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm8, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE-NEXT: pxor %xmm6, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: pcmpgtd %xmm4, %xmm7
; SSE-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: pandn %xmm9, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: por %xmm6, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: reassociate_smin_v8i64:
diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll
index c868524a1bb3f..6d7394180253b 100644
--- a/llvm/test/CodeGen/X86/machine-cp.ll
+++ b/llvm/test/CodeGen/X86/machine-cp.ll
@@ -100,29 +100,29 @@ define <16 x float> @foo(<16 x float> %x) {
; CHECK-LABEL: foo:
; CHECK: ## %bb.0: ## %bb
; CHECK-NEXT: movaps %xmm3, %xmm9
-; CHECK-NEXT: movaps %xmm2, %xmm8
+; CHECK-NEXT: movaps %xmm2, %xmm5
; CHECK-NEXT: movaps %xmm0, %xmm7
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: movaps %xmm3, %xmm2
; CHECK-NEXT: cmpltps %xmm0, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm4
; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; CHECK-NEXT: movaps %xmm4, %xmm10
-; CHECK-NEXT: andnps %xmm2, %xmm10
-; CHECK-NEXT: movaps %xmm8, %xmm5
-; CHECK-NEXT: cmpltps %xmm0, %xmm5
+; CHECK-NEXT: movaps %xmm4, %xmm8
+; CHECK-NEXT: andnps %xmm2, %xmm8
+; CHECK-NEXT: movaps %xmm5, %xmm6
+; CHECK-NEXT: cmpltps %xmm0, %xmm6
; CHECK-NEXT: movaps {{.*#+}} xmm11 = [9,10,11,12]
-; CHECK-NEXT: movaps %xmm5, %xmm2
+; CHECK-NEXT: movaps %xmm6, %xmm2
; CHECK-NEXT: orps %xmm11, %xmm2
-; CHECK-NEXT: movaps %xmm2, %xmm14
-; CHECK-NEXT: andnps %xmm5, %xmm14
+; CHECK-NEXT: movaps %xmm2, %xmm10
+; CHECK-NEXT: andnps %xmm6, %xmm10
; CHECK-NEXT: cvttps2dq %xmm1, %xmm12
; CHECK-NEXT: cmpltps %xmm0, %xmm1
; CHECK-NEXT: movaps {{.*#+}} xmm13 = [5,6,7,8]
; CHECK-NEXT: movaps %xmm1, %xmm6
; CHECK-NEXT: orps %xmm13, %xmm6
-; CHECK-NEXT: movaps %xmm6, %xmm5
-; CHECK-NEXT: andnps %xmm1, %xmm5
+; CHECK-NEXT: movaps %xmm6, %xmm14
+; CHECK-NEXT: andnps %xmm1, %xmm14
; CHECK-NEXT: cvttps2dq %xmm7, %xmm3
; CHECK-NEXT: cmpltps %xmm0, %xmm7
; CHECK-NEXT: movaps {{.*#+}} xmm15 = [1,2,3,4]
@@ -139,20 +139,20 @@ define <16 x float> @foo(<16 x float> %x) {
; CHECK-NEXT: andps %xmm13, %xmm6
; CHECK-NEXT: cvtdq2ps %xmm12, %xmm1
; CHECK-NEXT: andps %xmm1, %xmm6
-; CHECK-NEXT: andps %xmm3, %xmm5
-; CHECK-NEXT: orps %xmm5, %xmm6
+; CHECK-NEXT: andps %xmm3, %xmm14
+; CHECK-NEXT: orps %xmm14, %xmm6
; CHECK-NEXT: andps %xmm11, %xmm2
-; CHECK-NEXT: cvttps2dq %xmm8, %xmm1
+; CHECK-NEXT: cvttps2dq %xmm5, %xmm1
; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1
; CHECK-NEXT: andps %xmm1, %xmm2
-; CHECK-NEXT: andps %xmm3, %xmm14
-; CHECK-NEXT: orps %xmm14, %xmm2
; CHECK-NEXT: andps %xmm3, %xmm10
+; CHECK-NEXT: orps %xmm10, %xmm2
+; CHECK-NEXT: andps %xmm3, %xmm8
; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; CHECK-NEXT: cvttps2dq %xmm9, %xmm1
; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1
; CHECK-NEXT: andps %xmm1, %xmm4
-; CHECK-NEXT: orps %xmm10, %xmm4
+; CHECK-NEXT: orps %xmm8, %xmm4
; CHECK-NEXT: movaps %xmm6, %xmm1
; CHECK-NEXT: movaps %xmm4, %xmm3
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index e7648990c1493..4785531945b05 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -350,7 +350,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
; SSE2-LABEL: _Z10test_shortPsS_i_1024:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -362,28 +362,28 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5
; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6
; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7
-; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm9
-; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm0
-; SSE2-NEXT: pmaddwd %xmm5, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm0
-; SSE2-NEXT: pmaddwd %xmm6, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm0
-; SSE2-NEXT: pmaddwd %xmm7, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm0
-; SSE2-NEXT: pmaddwd %xmm9, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8
+; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9
+; SSE2-NEXT: pmaddwd %xmm5, %xmm9
+; SSE2-NEXT: paddd %xmm9, %xmm2
+; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm5
+; SSE2-NEXT: pmaddwd %xmm6, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm4
+; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm5
+; SSE2-NEXT: pmaddwd %xmm7, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm5
+; SSE2-NEXT: pmaddwd %xmm8, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd %xmm8, %xmm4
-; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm8, %xmm2
-; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
@@ -934,7 +934,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
; SSE2-LABEL: _Z9test_charPcS_i_1024:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -944,42 +944,42 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
; SSE2-NEXT: .LBB7_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7
-; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10
-; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm0
-; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm6
+; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm8
+; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; SSE2-NEXT: psraw $8, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; SSE2-NEXT: psraw $8, %xmm10
+; SSE2-NEXT: pmaddwd %xmm9, %xmm10
+; SSE2-NEXT: paddd %xmm10, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm8
+; SSE2-NEXT: pmaddwd %xmm7, %xmm8
+; SSE2-NEXT: paddd %xmm8, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: psraw $8, %xmm7
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; SSE2-NEXT: psraw $8, %xmm8
+; SSE2-NEXT: pmaddwd %xmm7, %xmm8
+; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm6
-; SSE2-NEXT: pmaddwd %xmm5, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pmaddwd %xmm5, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
-; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
-; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: pmaddwd %xmm0, %xmm5
+; SSE2-NEXT: pmaddwd %xmm6, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: addq $32, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB7_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd %xmm8, %xmm4
-; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm8, %xmm2
-; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
@@ -1388,20 +1388,20 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
; SSE2-NEXT: .LBB10_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4
-; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm8
+; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm5
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm6
; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm7
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pmulhuw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: pmulhuw %xmm4, %xmm8
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSE2-NEXT: paddd %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm7, %xmm4
-; SSE2-NEXT: pmulhuw %xmm8, %xmm4
-; SSE2-NEXT: pmullw %xmm8, %xmm7
+; SSE2-NEXT: pmulhuw %xmm5, %xmm4
+; SSE2-NEXT: pmullw %xmm5, %xmm7
; SSE2-NEXT: movdqa %xmm7, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; SSE2-NEXT: paddd %xmm5, %xmm3
@@ -1564,11 +1564,11 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; SSE2-LABEL: test_unsigned_short_1024:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pxor %xmm5, %xmm5
@@ -1576,59 +1576,59 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB11_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm0
-; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pmulhuw %xmm0, %xmm2
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw %xmm0, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: paddd %xmm0, %xmm6
-; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pmulhuw %xmm0, %xmm2
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw %xmm0, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: paddd %xmm2, %xmm10
+; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8
+; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pmulhuw %xmm8, %xmm10
+; SSE2-NEXT: pmullw %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
+; SSE2-NEXT: paddd %xmm8, %xmm7
+; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm10
+; SSE2-NEXT: paddd %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm10, %xmm9
+; SSE2-NEXT: pmulhuw %xmm8, %xmm9
+; SSE2-NEXT: pmullw %xmm8, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; SSE2-NEXT: paddd %xmm8, %xmm6
+; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9
+; SSE2-NEXT: paddd %xmm10, %xmm4
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pmulhuw %xmm8, %xmm10
+; SSE2-NEXT: pmullw %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; SSE2-NEXT: paddd %xmm8, %xmm0
+; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm10
+; SSE2-NEXT: paddd %xmm9, %xmm3
+; SSE2-NEXT: movdqa %xmm10, %xmm9
+; SSE2-NEXT: pmulhuw %xmm8, %xmm9
+; SSE2-NEXT: pmullw %xmm8, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE2-NEXT: paddd %xmm10, %xmm2
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB11_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: paddd %xmm7, %xmm10
-; SSE2-NEXT: paddd %xmm3, %xmm10
-; SSE2-NEXT: paddd %xmm4, %xmm8
-; SSE2-NEXT: paddd %xmm5, %xmm9
-; SSE2-NEXT: paddd %xmm10, %xmm9
-; SSE2-NEXT: paddd %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
-; SSE2-NEXT: paddd %xmm9, %xmm0
+; SSE2-NEXT: paddd %xmm7, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -1637,10 +1637,10 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; AVX1-LABEL: test_unsigned_short_1024:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB11_1: # %vector.body
@@ -1649,57 +1649,57 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm0, %xmm7, %xmm13
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm12, %xmm7, %xmm7
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm10, %xmm0, %xmm10
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm11, %xmm0, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm0
-; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
-; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0
-; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm0
-; AVX1-NEXT: vpaddd %xmm7, %xmm9, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
-; AVX1-NEXT: vpaddd %xmm0, %xmm10, %xmm0
-; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmulld %xmm4, %xmm12, %xmm4
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmulld %xmm5, %xmm12, %xmm5
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmulld %xmm6, %xmm12, %xmm6
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmulld %xmm7, %xmm12, %xmm7
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmulld %xmm8, %xmm12, %xmm8
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmulld %xmm9, %xmm12, %xmm9
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm10
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmulld %xmm11, %xmm12, %xmm11
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT: vpaddd %xmm4, %xmm12, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB11_1
; AVX1-NEXT: # %bb.2: # %middle.block
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm4
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -2220,8 +2220,8 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
@@ -2653,8 +2653,8 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %arg2, <8 x i16>* %arg3, <8 x i16>* %arg4, <8 x i16>* %arg5, <8 x i16>* %arg6, <8 x i16>* %arg7) {
; SSE2-LABEL: madd_quad_reduction:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm1
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
@@ -2665,8 +2665,8 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
; SSE2-NEXT: movdqu (%r9), %xmm3
; SSE2-NEXT: pmaddwd %xmm0, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqu (%rax), %xmm0
-; SSE2-NEXT: movdqu (%r10), %xmm1
+; SSE2-NEXT: movdqu (%r10), %xmm0
+; SSE2-NEXT: movdqu (%rax), %xmm1
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
@@ -2679,8 +2679,8 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
;
; AVX-LABEL: madd_quad_reduction:
; AVX: # %bb.0:
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rdx), %xmm1
@@ -2688,8 +2688,8 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
; AVX-NEXT: vmovdqu (%r8), %xmm2
; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovdqu (%rax), %xmm2
-; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
+; AVX-NEXT: vmovdqu (%r10), %xmm2
+; AVX-NEXT: vpmaddwd (%rax), %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll
index 725762446c203..045c42627a397 100644
--- a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll
+++ b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll
@@ -341,7 +341,7 @@ return:
define void @another_count_up_signed(ptr %d, i64 %n) nounwind {
; CHECK-LABEL: another_count_up_signed:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %r8d, %r8d
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
@@ -350,22 +350,22 @@ define void @another_count_up_signed(ptr %d, i64 %n) nounwind {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB6_1: # %loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq %r8, %rax
-; CHECK-NEXT: sarq $8, %rax
+; CHECK-NEXT: movq %rax, %r8
+; CHECK-NEXT: sarq $8, %r8
; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT: mulsd %xmm0, %xmm3
-; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8)
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: sarq $24, %rax
+; CHECK-NEXT: movsd %xmm3, (%rdi,%r8,8)
+; CHECK-NEXT: movq %rcx, %r8
+; CHECK-NEXT: sarq $24, %r8
; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT: mulsd %xmm1, %xmm3
-; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8)
+; CHECK-NEXT: movsd %xmm3, (%rdi,%r8,8)
; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT: mulsd %xmm2, %xmm3
; CHECK-NEXT: movsd %xmm3, (%rdx)
; CHECK-NEXT: addq $8, %rdx
; CHECK-NEXT: addq $16777216, %rcx # imm = 0x1000000
-; CHECK-NEXT: addq $256, %r8 # imm = 0x100
+; CHECK-NEXT: addq $256, %rax # imm = 0x100
; CHECK-NEXT: decq %rsi
; CHECK-NEXT: jne .LBB6_1
; CHECK-NEXT: # %bb.2: # %return
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 65491586558f4..9c75a13be5d7e 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -1876,12 +1876,12 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32>
; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5
+; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpackssdw %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpmovmskb %xmm4, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 57406af00b30f..ae2ef409ab0c8 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -2024,12 +2024,12 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5
+; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpackssdw %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpmovmskb %xmm4, %ecx
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 9247cb1d4d914..b20ce3a616098 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -593,13 +593,13 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-LABEL: gather_v16i8_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE-NEXT: pmovsxdq %xmm0, %xmm0
-; SSE-NEXT: paddq %xmm8, %xmm0
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpeqb %xmm4, %xmm6
-; SSE-NEXT: pmovmskb %xmm6, %eax
+; SSE-NEXT: paddq %xmm6, %xmm0
+; SSE-NEXT: pxor %xmm8, %xmm8
+; SSE-NEXT: pcmpeqb %xmm4, %xmm8
+; SSE-NEXT: pmovmskb %xmm8, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: je .LBB3_2
; SSE-NEXT: # %bb.1: # %cond.load
@@ -613,7 +613,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pextrq $1, %xmm0, %rcx
; SSE-NEXT: pinsrb $1, (%rcx), %xmm5
; SSE-NEXT: .LBB3_4: # %else2
-; SSE-NEXT: paddq %xmm8, %xmm4
+; SSE-NEXT: paddq %xmm6, %xmm4
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je .LBB3_6
; SSE-NEXT: # %bb.5: # %cond.load4
@@ -628,7 +628,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pinsrb $3, (%rcx), %xmm5
; SSE-NEXT: .LBB3_8: # %else8
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE-NEXT: paddq %xmm8, %xmm0
+; SSE-NEXT: paddq %xmm6, %xmm0
; SSE-NEXT: testb $16, %al
; SSE-NEXT: je .LBB3_10
; SSE-NEXT: # %bb.9: # %cond.load10
@@ -642,7 +642,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pextrq $1, %xmm0, %rcx
; SSE-NEXT: pinsrb $5, (%rcx), %xmm5
; SSE-NEXT: .LBB3_12: # %else14
-; SSE-NEXT: paddq %xmm8, %xmm1
+; SSE-NEXT: paddq %xmm6, %xmm1
; SSE-NEXT: testb $64, %al
; SSE-NEXT: je .LBB3_14
; SSE-NEXT: # %bb.13: # %cond.load16
@@ -657,7 +657,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pinsrb $7, (%rcx), %xmm5
; SSE-NEXT: .LBB3_16: # %else20
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; SSE-NEXT: paddq %xmm8, %xmm0
+; SSE-NEXT: paddq %xmm6, %xmm0
; SSE-NEXT: testl $256, %eax # imm = 0x100
; SSE-NEXT: je .LBB3_18
; SSE-NEXT: # %bb.17: # %cond.load22
@@ -671,7 +671,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pextrq $1, %xmm0, %rcx
; SSE-NEXT: pinsrb $9, (%rcx), %xmm5
; SSE-NEXT: .LBB3_20: # %else26
-; SSE-NEXT: paddq %xmm8, %xmm1
+; SSE-NEXT: paddq %xmm6, %xmm1
; SSE-NEXT: testl $1024, %eax # imm = 0x400
; SSE-NEXT: je .LBB3_22
; SSE-NEXT: # %bb.21: # %cond.load28
@@ -686,7 +686,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pinsrb $11, (%rcx), %xmm5
; SSE-NEXT: .LBB3_24: # %else32
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; SSE-NEXT: paddq %xmm8, %xmm0
+; SSE-NEXT: paddq %xmm6, %xmm0
; SSE-NEXT: testl $4096, %eax # imm = 0x1000
; SSE-NEXT: je .LBB3_26
; SSE-NEXT: # %bb.25: # %cond.load34
@@ -700,7 +700,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: pextrq $1, %xmm0, %rcx
; SSE-NEXT: pinsrb $13, (%rcx), %xmm5
; SSE-NEXT: .LBB3_28: # %else38
-; SSE-NEXT: paddq %xmm1, %xmm8
+; SSE-NEXT: paddq %xmm1, %xmm6
; SSE-NEXT: testl $16384, %eax # imm = 0x4000
; SSE-NEXT: jne .LBB3_29
; SSE-NEXT: # %bb.30: # %else41
@@ -710,12 +710,12 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8
; SSE-NEXT: movdqa %xmm5, %xmm0
; SSE-NEXT: retq
; SSE-NEXT: .LBB3_29: # %cond.load40
-; SSE-NEXT: movq %xmm8, %rcx
+; SSE-NEXT: movq %xmm6, %rcx
; SSE-NEXT: pinsrb $14, (%rcx), %xmm5
; SSE-NEXT: testl $32768, %eax # imm = 0x8000
; SSE-NEXT: je .LBB3_32
; SSE-NEXT: .LBB3_31: # %cond.load43
-; SSE-NEXT: pextrq $1, %xmm8, %rax
+; SSE-NEXT: pextrq $1, %xmm6, %rax
; SSE-NEXT: pinsrb $15, (%rax), %xmm5
; SSE-NEXT: movdqa %xmm5, %xmm0
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index d4775afc0c417..56eca7ca5ef03 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -576,20 +576,18 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x double
define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double> %dst) {
; SSE2-LABEL: load_v8f64_v8i64:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa %xmm7, %xmm8
-; SSE2-NEXT: movaps %xmm6, %xmm9
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: packssdw %xmm6, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm1
+; SSE2-NEXT: packssdw %xmm9, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packssdw %xmm2, %xmm1
@@ -620,12 +618,12 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB6_16
; SSE2-NEXT: LBB6_15: ## %cond.load19
-; SSE2-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1]
+; SSE2-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
; SSE2-NEXT: LBB6_16: ## %else20
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm5, %xmm1
-; SSE2-NEXT: movaps %xmm9, %xmm2
-; SSE2-NEXT: movaps %xmm8, %xmm3
+; SSE2-NEXT: movaps %xmm6, %xmm2
+; SSE2-NEXT: movaps %xmm7, %xmm3
; SSE2-NEXT: retq
; SSE2-NEXT: LBB6_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
@@ -644,28 +642,27 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB6_10
; SSE2-NEXT: LBB6_9: ## %cond.load10
-; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3]
+; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB6_12
; SSE2-NEXT: LBB6_11: ## %cond.load13
-; SSE2-NEXT: movhps {{.*#+}} xmm9 = xmm9[0,1],mem[0,1]
+; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB6_14
; SSE2-NEXT: LBB6_13: ## %cond.load16
-; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
+; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB6_15
; SSE2-NEXT: jmp LBB6_16
;
; SSE42-LABEL: load_v8f64_v8i64:
; SSE42: ## %bb.0:
-; SSE42-NEXT: movdqa %xmm7, %xmm8
-; SSE42-NEXT: pxor %xmm7, %xmm7
-; SSE42-NEXT: pcmpeqq %xmm7, %xmm3
-; SSE42-NEXT: pcmpeqq %xmm7, %xmm2
+; SSE42-NEXT: pxor %xmm8, %xmm8
+; SSE42-NEXT: pcmpeqq %xmm8, %xmm3
+; SSE42-NEXT: pcmpeqq %xmm8, %xmm2
; SSE42-NEXT: packssdw %xmm3, %xmm2
-; SSE42-NEXT: pcmpeqq %xmm7, %xmm1
-; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm8, %xmm1
+; SSE42-NEXT: pcmpeqq %xmm8, %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: packssdw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
@@ -694,12 +691,12 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB6_16
; SSE42-NEXT: LBB6_15: ## %cond.load19
-; SSE42-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1]
+; SSE42-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
; SSE42-NEXT: LBB6_16: ## %else20
; SSE42-NEXT: movaps %xmm4, %xmm0
; SSE42-NEXT: movaps %xmm5, %xmm1
; SSE42-NEXT: movaps %xmm6, %xmm2
-; SSE42-NEXT: movaps %xmm8, %xmm3
+; SSE42-NEXT: movaps %xmm7, %xmm3
; SSE42-NEXT: retq
; SSE42-NEXT: LBB6_1: ## %cond.load
; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
@@ -726,7 +723,7 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB6_14
; SSE42-NEXT: LBB6_13: ## %cond.load16
-; SSE42-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
+; SSE42-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB6_15
; SSE42-NEXT: jmp LBB6_16
@@ -1977,20 +1974,18 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i64> %dst
define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst) {
; SSE2-LABEL: load_v8i64_v8i64:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa %xmm7, %xmm8
-; SSE2-NEXT: movaps %xmm6, %xmm9
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: packssdw %xmm6, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm1
+; SSE2-NEXT: packssdw %xmm9, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packssdw %xmm2, %xmm1
@@ -2021,13 +2016,13 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB16_16
; SSE2-NEXT: LBB16_15: ## %cond.load19
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
; SSE2-NEXT: LBB16_16: ## %else20
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm5, %xmm1
-; SSE2-NEXT: movaps %xmm9, %xmm2
-; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: movaps %xmm6, %xmm2
+; SSE2-NEXT: movaps %xmm7, %xmm3
; SSE2-NEXT: retq
; SSE2-NEXT: LBB16_1: ## %cond.load
; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
@@ -2048,29 +2043,28 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB16_10
; SSE2-NEXT: LBB16_9: ## %cond.load10
-; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3]
+; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB16_12
; SSE2-NEXT: LBB16_11: ## %cond.load13
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB16_14
; SSE2-NEXT: LBB16_13: ## %cond.load16
-; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
+; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB16_15
; SSE2-NEXT: jmp LBB16_16
;
; SSE42-LABEL: load_v8i64_v8i64:
; SSE42: ## %bb.0:
-; SSE42-NEXT: movdqa %xmm7, %xmm8
-; SSE42-NEXT: pxor %xmm7, %xmm7
-; SSE42-NEXT: pcmpeqq %xmm7, %xmm3
-; SSE42-NEXT: pcmpeqq %xmm7, %xmm2
+; SSE42-NEXT: pxor %xmm8, %xmm8
+; SSE42-NEXT: pcmpeqq %xmm8, %xmm3
+; SSE42-NEXT: pcmpeqq %xmm8, %xmm2
; SSE42-NEXT: packssdw %xmm3, %xmm2
-; SSE42-NEXT: pcmpeqq %xmm7, %xmm1
-; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm8, %xmm1
+; SSE42-NEXT: pcmpeqq %xmm8, %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: packssdw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
@@ -2099,12 +2093,12 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: je LBB16_16
; SSE42-NEXT: LBB16_15: ## %cond.load19
-; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm8
+; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm7
; SSE42-NEXT: LBB16_16: ## %else20
; SSE42-NEXT: movdqa %xmm4, %xmm0
; SSE42-NEXT: movdqa %xmm5, %xmm1
; SSE42-NEXT: movdqa %xmm6, %xmm2
-; SSE42-NEXT: movdqa %xmm8, %xmm3
+; SSE42-NEXT: movdqa %xmm7, %xmm3
; SSE42-NEXT: retq
; SSE42-NEXT: LBB16_1: ## %cond.load
; SSE42-NEXT: pinsrq $0, (%rdi), %xmm4
@@ -2131,7 +2125,7 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst
; SSE42-NEXT: testb $64, %al
; SSE42-NEXT: je LBB16_14
; SSE42-NEXT: LBB16_13: ## %cond.load16
-; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm8
+; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm7
; SSE42-NEXT: testb $-128, %al
; SSE42-NEXT: jne LBB16_15
; SSE42-NEXT: jmp LBB16_16
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index d85e2da0094ba..e0358dcc75747 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -11,108 +11,108 @@
define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm12, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm10
-; SSE2-NEXT: pand %xmm10, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm10
-; SSE2-NEXT: por %xmm2, %xmm10
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm12, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm7
-; SSE2-NEXT: por %xmm3, %xmm7
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm12, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm9, %xmm6
+; SSE2-NEXT: por %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm11
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm9, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm11
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm9, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm12, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm12, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm14
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT: pcmpgtd %xmm13, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm6
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm12, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm13, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm1
+; SSE2-NEXT: pandn %xmm9, %xmm11
+; SSE2-NEXT: por %xmm1, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067968,18446744071562067968]
+; SSE2-NEXT: movdqa %xmm11, %xmm1
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm12
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320]
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm11
+; SSE2-NEXT: pandn %xmm0, %xmm12
+; SSE2-NEXT: por %xmm11, %xmm12
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: pxor %xmm12, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm13, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSE2-NEXT: pxor %xmm10, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm13, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,0,2,2]
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm14, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm14, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm11
+; SSE2-NEXT: pxor %xmm8, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT: pxor %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: packsswb %xmm4, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %eax
@@ -122,32 +122,32 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: movss %xmm1, (%rdi)
; SSE2-NEXT: .LBB0_2: # %else
; SSE2-NEXT: por %xmm11, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm6
+; SSE2-NEXT: por %xmm8, %xmm10
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB0_4
; SSE2-NEXT: # %bb.3: # %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE2-NEXT: movd %xmm0, 4(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; SSE2-NEXT: movd %xmm4, 4(%rdi)
; SSE2-NEXT: .LBB0_4: # %else2
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: pand %xmm6, %xmm10
-; SSE2-NEXT: pandn %xmm9, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm10, %xmm6
+; SSE2-NEXT: pandn %xmm0, %xmm10
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB0_6
; SSE2-NEXT: # %bb.5: # %cond.store3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: .LBB0_6: # %else4
-; SSE2-NEXT: por %xmm3, %xmm7
-; SSE2-NEXT: por %xmm6, %xmm10
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: por %xmm10, %xmm6
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB0_8
; SSE2-NEXT: # %bb.7: # %cond.store5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE2-NEXT: movd %xmm0, 12(%rdi)
; SSE2-NEXT: .LBB0_8: # %else6
-; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm7[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,2]
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne .LBB0_9
; SSE2-NEXT: # %bb.10: # %else8
@@ -162,65 +162,65 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: .LBB0_16: # %else14
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB0_9: # %cond.store7
-; SSE2-NEXT: movss %xmm10, 16(%rdi)
+; SSE2-NEXT: movss %xmm6, 16(%rdi)
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je .LBB0_12
; SSE2-NEXT: .LBB0_11: # %cond.store9
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
; SSE2-NEXT: movd %xmm0, 20(%rdi)
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je .LBB0_14
; SSE2-NEXT: .LBB0_13: # %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 24(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB0_16
; SSE2-NEXT: .LBB0_15: # %cond.store13
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
; SSE2-NEXT: movd %xmm0, 28(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i64_v8i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm9
-; SSE4-NEXT: pxor %xmm8, %xmm8
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647]
-; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm6
+; SSE4-NEXT: pxor %xmm7, %xmm7
+; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm10
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10
-; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm8
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3
-; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm10
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm10
+; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE4-NEXT: movapd %xmm7, %xmm0
+; SSE4-NEXT: movapd %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm2
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm2
+; SSE4-NEXT: movdqa %xmm1, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm6
; SSE4-NEXT: movapd %xmm3, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm7
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE4-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2]
-; SSE4-NEXT: movapd %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm1, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE4-NEXT: movapd %xmm10, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE4-NEXT: movapd %xmm10, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm3
+; SSE4-NEXT: movapd %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm8, %xmm5
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1
+; SSE4-NEXT: pcmpeqd %xmm7, %xmm5
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
; SSE4-NEXT: pxor %xmm0, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE4-NEXT: pcmpeqd %xmm7, %xmm4
; SSE4-NEXT: pxor %xmm0, %xmm4
; SSE4-NEXT: packssdw %xmm5, %xmm4
; SSE4-NEXT: packsswb %xmm4, %xmm4
@@ -237,7 +237,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB0_8
; SSE4-NEXT: .LBB0_7: # %cond.store5
-; SSE4-NEXT: extractps $3, %xmm7, 12(%rdi)
+; SSE4-NEXT: extractps $3, %xmm2, 12(%rdi)
; SSE4-NEXT: .LBB0_8: # %else6
; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; SSE4-NEXT: testb $16, %al
@@ -254,15 +254,15 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: .LBB0_16: # %else14
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB0_1: # %cond.store
-; SSE4-NEXT: movss %xmm7, (%rdi)
+; SSE4-NEXT: movss %xmm2, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB0_4
; SSE4-NEXT: .LBB0_3: # %cond.store1
-; SSE4-NEXT: extractps $1, %xmm7, 4(%rdi)
+; SSE4-NEXT: extractps $1, %xmm2, 4(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB0_6
; SSE4-NEXT: .LBB0_5: # %cond.store3
-; SSE4-NEXT: extractps $2, %xmm7, 8(%rdi)
+; SSE4-NEXT: extractps $2, %xmm2, 8(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: jne .LBB0_7
; SSE4-NEXT: jmp .LBB0_8
@@ -383,118 +383,118 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [32767,32767]
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm11, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147516415,2147516415]
-; SSE2-NEXT: movdqa %xmm12, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm10
-; SSE2-NEXT: pand %xmm10, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm10
-; SSE2-NEXT: por %xmm2, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm7, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147516415,2147516415]
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm9, %xmm8
+; SSE2-NEXT: por %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm7
-; SSE2-NEXT: por %xmm3, %xmm7
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm12, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm13
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm13
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm6
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE2-NEXT: por %xmm13, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm7
-; SSE2-NEXT: pandn %xmm9, %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm10
; SSE2-NEXT: pandn %xmm9, %xmm2
-; SSE2-NEXT: por %xmm10, %xmm2
-; SSE2-NEXT: packssdw %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm9, %xmm10
+; SSE2-NEXT: por %xmm0, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm1
+; SSE2-NEXT: pandn %xmm9, %xmm11
+; SSE2-NEXT: por %xmm1, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; SSE2-NEXT: movdqa %xmm11, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm12
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200]
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm11
+; SSE2-NEXT: pandn %xmm3, %xmm12
+; SSE2-NEXT: por %xmm11, %xmm12
+; SSE2-NEXT: movdqa %xmm10, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm10, %xmm0
+; SSE2-NEXT: packssdw %xmm12, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm12, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm10
+; SSE2-NEXT: por %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE2-NEXT: pand %xmm2, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm8
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: packssdw %xmm10, %xmm2
; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm13, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm13, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: packsswb %xmm4, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %eax
@@ -565,47 +565,47 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v8i64_v8i16:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm9
-; SSE4-NEXT: pxor %xmm8, %xmm8
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [32767,32767]
-; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm6
+; SSE4-NEXT: pxor %xmm7, %xmm7
+; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [32767,32767]
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm10
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10
-; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm8
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm2
+; SSE4-NEXT: movdqa %xmm9, %xmm2
; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSE4-NEXT: movapd %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709518848,18446744073709518848]
+; SSE4-NEXT: movapd %xmm9, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm6, %xmm10
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm10
; SSE4-NEXT: movapd %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm7
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE4-NEXT: packssdw %xmm6, %xmm7
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm6, %xmm1
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE4-NEXT: packssdw %xmm10, %xmm1
; SSE4-NEXT: movapd %xmm2, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm3
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm6, %xmm3
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE4-NEXT: movapd %xmm10, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1
-; SSE4-NEXT: packssdw %xmm3, %xmm1
-; SSE4-NEXT: packssdw %xmm1, %xmm7
-; SSE4-NEXT: pcmpeqd %xmm8, %xmm5
+; SSE4-NEXT: movapd %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm6
+; SSE4-NEXT: packssdw %xmm3, %xmm6
+; SSE4-NEXT: packssdw %xmm6, %xmm1
+; SSE4-NEXT: pcmpeqd %xmm7, %xmm5
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
; SSE4-NEXT: pxor %xmm0, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE4-NEXT: pcmpeqd %xmm7, %xmm4
; SSE4-NEXT: pxor %xmm0, %xmm4
; SSE4-NEXT: packssdw %xmm5, %xmm4
; SSE4-NEXT: packsswb %xmm4, %xmm4
@@ -636,35 +636,35 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: .LBB1_16: # %else14
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB1_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm7, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm1, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB1_4
; SSE4-NEXT: .LBB1_3: # %cond.store1
-; SSE4-NEXT: pextrw $1, %xmm7, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm1, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB1_6
; SSE4-NEXT: .LBB1_5: # %cond.store3
-; SSE4-NEXT: pextrw $2, %xmm7, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm1, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB1_8
; SSE4-NEXT: .LBB1_7: # %cond.store5
-; SSE4-NEXT: pextrw $3, %xmm7, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm1, 6(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB1_10
; SSE4-NEXT: .LBB1_9: # %cond.store7
-; SSE4-NEXT: pextrw $4, %xmm7, 8(%rdi)
+; SSE4-NEXT: pextrw $4, %xmm1, 8(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB1_12
; SSE4-NEXT: .LBB1_11: # %cond.store9
-; SSE4-NEXT: pextrw $5, %xmm7, 10(%rdi)
+; SSE4-NEXT: pextrw $5, %xmm1, 10(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB1_14
; SSE4-NEXT: .LBB1_13: # %cond.store11
-; SSE4-NEXT: pextrw $6, %xmm7, 12(%rdi)
+; SSE4-NEXT: pextrw $6, %xmm1, 12(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB1_16
; SSE4-NEXT: .LBB1_15: # %cond.store13
-; SSE4-NEXT: pextrw $7, %xmm7, 14(%rdi)
+; SSE4-NEXT: pextrw $7, %xmm1, 14(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i16:
@@ -933,119 +933,119 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [127,127]
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm11, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm12, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm10
-; SSE2-NEXT: pand %xmm10, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm10
-; SSE2-NEXT: por %xmm2, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm7, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775]
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm9, %xmm8
+; SSE2-NEXT: por %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm7
-; SSE2-NEXT: por %xmm3, %xmm7
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm12, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm13
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm13
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm6
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm7
-; SSE2-NEXT: pandn %xmm9, %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm13, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm12, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2]
+; SSE2-NEXT: por %xmm13, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm10
; SSE2-NEXT: pandn %xmm9, %xmm2
-; SSE2-NEXT: por %xmm10, %xmm2
-; SSE2-NEXT: packssdw %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm9, %xmm10
+; SSE2-NEXT: por %xmm0, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm1
+; SSE2-NEXT: pandn %xmm9, %xmm11
+; SSE2-NEXT: por %xmm1, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: movdqa %xmm11, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm12
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm11
+; SSE2-NEXT: pandn %xmm3, %xmm12
+; SSE2-NEXT: por %xmm11, %xmm12
+; SSE2-NEXT: movdqa %xmm10, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm10, %xmm0
+; SSE2-NEXT: packssdw %xmm12, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm12, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm10
+; SSE2-NEXT: por %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE2-NEXT: pand %xmm2, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm8
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: packssdw %xmm10, %xmm2
; SSE2-NEXT: packssdw %xmm2, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2-NEXT: pxor %xmm13, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm13, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: packsswb %xmm4, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %eax
@@ -1109,48 +1109,48 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v8i64_v8i8:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm9
-; SSE4-NEXT: pxor %xmm8, %xmm8
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [127,127]
-; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm6
+; SSE4-NEXT: pxor %xmm7, %xmm7
+; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [127,127]
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm10
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10
-; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm8
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm2
+; SSE4-NEXT: movdqa %xmm9, %xmm2
; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; SSE4-NEXT: movapd %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm9, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
+; SSE4-NEXT: movdqa %xmm9, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
+; SSE4-NEXT: movapd %xmm9, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm6, %xmm10
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm10
; SSE4-NEXT: movapd %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm7
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE4-NEXT: packssdw %xmm6, %xmm7
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm6, %xmm1
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE4-NEXT: packssdw %xmm10, %xmm1
; SSE4-NEXT: movapd %xmm2, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm3
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movdqa %xmm6, %xmm3
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE4-NEXT: movapd %xmm10, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1
-; SSE4-NEXT: packssdw %xmm3, %xmm1
-; SSE4-NEXT: packssdw %xmm1, %xmm7
-; SSE4-NEXT: packsswb %xmm7, %xmm7
-; SSE4-NEXT: pcmpeqd %xmm8, %xmm5
+; SSE4-NEXT: movapd %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm6
+; SSE4-NEXT: packssdw %xmm3, %xmm6
+; SSE4-NEXT: packssdw %xmm6, %xmm1
+; SSE4-NEXT: packsswb %xmm1, %xmm1
+; SSE4-NEXT: pcmpeqd %xmm7, %xmm5
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
; SSE4-NEXT: pxor %xmm0, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE4-NEXT: pcmpeqd %xmm7, %xmm4
; SSE4-NEXT: pxor %xmm0, %xmm4
; SSE4-NEXT: packssdw %xmm5, %xmm4
; SSE4-NEXT: packsswb %xmm4, %xmm4
@@ -1181,35 +1181,35 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: .LBB2_16: # %else14
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB2_1: # %cond.store
-; SSE4-NEXT: pextrb $0, %xmm7, (%rdi)
+; SSE4-NEXT: pextrb $0, %xmm1, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB2_4
; SSE4-NEXT: .LBB2_3: # %cond.store1
-; SSE4-NEXT: pextrb $1, %xmm7, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB2_6
; SSE4-NEXT: .LBB2_5: # %cond.store3
-; SSE4-NEXT: pextrb $2, %xmm7, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB2_8
; SSE4-NEXT: .LBB2_7: # %cond.store5
-; SSE4-NEXT: pextrb $3, %xmm7, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB2_10
; SSE4-NEXT: .LBB2_9: # %cond.store7
-; SSE4-NEXT: pextrb $4, %xmm7, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm1, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB2_12
; SSE4-NEXT: .LBB2_11: # %cond.store9
-; SSE4-NEXT: pextrb $5, %xmm7, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm1, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB2_14
; SSE4-NEXT: .LBB2_13: # %cond.store11
-; SSE4-NEXT: pextrb $6, %xmm7, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm1, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB2_16
; SSE4-NEXT: .LBB2_15: # %cond.store13
-; SSE4-NEXT: pextrb $7, %xmm7, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm1, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i8:
@@ -1480,63 +1480,63 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm5
+; SSE2-NEXT: pandn %xmm6, %xmm5
; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,2,2]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320]
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: por %xmm7, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm9, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: movmskps %xmm2, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: testb $1, %al
@@ -1709,64 +1709,64 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [32767,32767]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415]
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147516415,2147516415]
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm5
+; SSE2-NEXT: pandn %xmm6, %xmm5
; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,2,2]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200]
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: por %xmm7, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm9, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: packssdw %xmm4, %xmm0
+; SSE2-NEXT: packssdw %xmm6, %xmm0
; SSE2-NEXT: packssdw %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: movmskps %xmm2, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: testb $1, %al
@@ -2023,68 +2023,68 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [127,127]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [127,127]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483775,2147483775]
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm5
+; SSE2-NEXT: pandn %xmm6, %xmm5
; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: por %xmm0, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm9, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: por %xmm10, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: movmskps %xmm2, %ecx
; SSE2-NEXT: xorl $15, %ecx
; SSE2-NEXT: testb $1, %cl
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 53d7421c5aff9..b2129a0cc0b95 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -11,91 +11,91 @@
define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm11, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT: movdqa %xmm12, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm6
-; SSE2-NEXT: por %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
+; SSE2-NEXT: movdqa %xmm9, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm11
+; SSE2-NEXT: por %xmm1, %xmm11
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
+; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm10
-; SSE2-NEXT: movdqa %xmm12, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm13
-; SSE2-NEXT: pand %xmm13, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm13
-; SSE2-NEXT: por %xmm0, %xmm13
-; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm6[0,2]
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm12, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm12
; SSE2-NEXT: pcmpgtd %xmm1, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2]
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm11
+; SSE2-NEXT: pxor %xmm8, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm11, %xmm11
+; SSE2-NEXT: pxor %xmm11, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm11, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: packsswb %xmm4, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je .LBB0_2
; SSE2-NEXT: # %bb.1: # %cond.store
-; SSE2-NEXT: movss %xmm13, (%rdi)
+; SSE2-NEXT: movss %xmm1, (%rdi)
; SSE2-NEXT: .LBB0_2: # %else
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: por %xmm9, %xmm8
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB0_4
; SSE2-NEXT: # %bb.3: # %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1]
-; SSE2-NEXT: movd %xmm1, 4(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; SSE2-NEXT: movd %xmm4, 4(%rdi)
; SSE2-NEXT: .LBB0_4: # %else2
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm0
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm8
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB0_6
; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
-; SSE2-NEXT: movd %xmm1, 8(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT: movd %xmm4, 8(%rdi)
; SSE2-NEXT: .LBB0_6: # %else4
; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: por %xmm8, %xmm2
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB0_8
; SSE2-NEXT: # %bb.7: # %cond.store5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE2-NEXT: movd %xmm0, 12(%rdi)
; SSE2-NEXT: .LBB0_8: # %else6
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
@@ -133,34 +133,34 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v8i64_v8i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm0, %xmm6
; SSE4-NEXT: pxor %xmm9, %xmm9
-; SSE4-NEXT: movapd {{.*#+}} xmm10 = [4294967295,4294967295]
-; SSE4-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm1, %xmm6
-; SSE4-NEXT: pxor %xmm11, %xmm6
+; SSE4-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm1, %xmm11
+; SSE4-NEXT: pxor %xmm10, %xmm11
; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372041149743103,9223372041149743103]
; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6
-; SSE4-NEXT: movdqa %xmm8, %xmm1
-; SSE4-NEXT: pxor %xmm11, %xmm1
+; SSE4-NEXT: pcmpgtq %xmm11, %xmm0
+; SSE4-NEXT: movapd %xmm8, %xmm11
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm11
+; SSE4-NEXT: movdqa %xmm6, %xmm1
+; SSE4-NEXT: pxor %xmm10, %xmm1
; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm1
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
+; SSE4-NEXT: movapd %xmm8, %xmm1
+; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2]
; SSE4-NEXT: movdqa %xmm3, %xmm6
-; SSE4-NEXT: pxor %xmm11, %xmm6
+; SSE4-NEXT: pxor %xmm10, %xmm6
; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm8
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
-; SSE4-NEXT: pxor %xmm2, %xmm11
-; SSE4-NEXT: pcmpgtq %xmm11, %xmm7
+; SSE4-NEXT: movapd %xmm8, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE4-NEXT: pxor %xmm2, %xmm10
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm7
; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8
; SSE4-NEXT: pcmpeqd %xmm9, %xmm5
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
; SSE4-NEXT: pxor %xmm0, %xmm5
@@ -183,7 +183,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: .LBB0_7: # %cond.store5
; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi)
; SSE4-NEXT: .LBB0_8: # %else6
-; SSE4-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
+; SSE4-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2]
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: jne .LBB0_9
; SSE4-NEXT: # %bb.10: # %else8
@@ -211,19 +211,19 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: jne .LBB0_7
; SSE4-NEXT: jmp .LBB0_8
; SSE4-NEXT: .LBB0_9: # %cond.store7
-; SSE4-NEXT: movss %xmm10, 16(%rdi)
+; SSE4-NEXT: movss %xmm8, 16(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB0_12
; SSE4-NEXT: .LBB0_11: # %cond.store9
-; SSE4-NEXT: extractps $1, %xmm10, 20(%rdi)
+; SSE4-NEXT: extractps $1, %xmm8, 20(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB0_14
; SSE4-NEXT: .LBB0_13: # %cond.store11
-; SSE4-NEXT: extractps $2, %xmm10, 24(%rdi)
+; SSE4-NEXT: extractps $2, %xmm8, 24(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB0_16
; SSE4-NEXT: .LBB0_15: # %cond.store13
-; SSE4-NEXT: extractps $3, %xmm10, 28(%rdi)
+; SSE4-NEXT: extractps $3, %xmm8, 28(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i32:
@@ -235,7 +235,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm8
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
@@ -246,17 +246,17 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm7
; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm6, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm8
+; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX1-NEXT: vmaskmovps %ymm0, %ymm8, (%rdi)
+; AVX1-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -318,76 +318,76 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm12
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm12, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm12
-; SSE2-NEXT: pand %xmm12, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm12
-; SSE2-NEXT: por %xmm2, %xmm12
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm9, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm6
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm9, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm11
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm9, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm11
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm11
+; SSE2-NEXT: por %xmm0, %xmm11
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm7, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: packsswb %xmm4, %xmm4
@@ -459,36 +459,36 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v8i64_v8i16:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa %xmm0, %xmm6
; SSE4-NEXT: pxor %xmm8, %xmm8
-; SSE4-NEXT: movapd {{.*#+}} xmm6 = [65535,65535]
+; SSE4-NEXT: movapd {{.*#+}} xmm9 = [65535,65535]
; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm1, %xmm7
-; SSE4-NEXT: pxor %xmm10, %xmm7
-; SSE4-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854841343,9223372036854841343]
-; SSE4-NEXT: movdqa %xmm11, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT: movapd %xmm6, %xmm7
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7
-; SSE4-NEXT: movdqa %xmm9, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm11
+; SSE4-NEXT: pxor %xmm10, %xmm11
+; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854841343,9223372036854841343]
+; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm11, %xmm0
+; SSE4-NEXT: movapd %xmm9, %xmm11
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm11
+; SSE4-NEXT: movdqa %xmm6, %xmm1
; SSE4-NEXT: pxor %xmm10, %xmm1
-; SSE4-NEXT: movdqa %xmm11, %xmm0
+; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movapd %xmm6, %xmm1
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm1
-; SSE4-NEXT: packusdw %xmm7, %xmm1
-; SSE4-NEXT: movdqa %xmm3, %xmm7
-; SSE4-NEXT: pxor %xmm10, %xmm7
-; SSE4-NEXT: movdqa %xmm11, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT: movapd %xmm6, %xmm7
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT: movapd %xmm9, %xmm1
+; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE4-NEXT: packusdw %xmm11, %xmm1
+; SSE4-NEXT: movdqa %xmm3, %xmm6
+; SSE4-NEXT: pxor %xmm10, %xmm6
+; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movapd %xmm9, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE4-NEXT: pxor %xmm2, %xmm10
-; SSE4-NEXT: pcmpgtq %xmm10, %xmm11
-; SSE4-NEXT: movdqa %xmm11, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT: packusdw %xmm7, %xmm6
-; SSE4-NEXT: packusdw %xmm6, %xmm1
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm7
+; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
+; SSE4-NEXT: packusdw %xmm6, %xmm9
+; SSE4-NEXT: packusdw %xmm9, %xmm1
; SSE4-NEXT: pcmpeqd %xmm8, %xmm5
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
; SSE4-NEXT: pxor %xmm0, %xmm5
@@ -812,75 +812,75 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm12
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm12, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm7
-; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT: movdqa %xmm9, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm1
+; SSE2-NEXT: pandn %xmm7, %xmm11
+; SSE2-NEXT: por %xmm1, %xmm11
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm12
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm12, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm6
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
+; SSE2-NEXT: pandn %xmm7, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
+; SSE2-NEXT: packuswb %xmm11, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm11
; SSE2-NEXT: pcmpgtd %xmm0, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm10
+; SSE2-NEXT: por %xmm3, %xmm10
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: packuswb %xmm1, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm6
-; SSE2-NEXT: packuswb %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
+; SSE2-NEXT: packuswb %xmm10, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm0, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: packsswb %xmm4, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: movd %xmm6, %ecx
+; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %else
; SSE2-NEXT: testb $2, %al
@@ -896,7 +896,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB2_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: pextrw $2, %xmm6, %ecx
+; SSE2-NEXT: pextrw $2, %xmm1, %ecx
; SSE2-NEXT: je .LBB2_10
; SSE2-NEXT: # %bb.9: # %cond.store7
; SSE2-NEXT: movb %cl, 4(%rdi)
@@ -907,7 +907,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB2_12: # %else10
; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: pextrw $3, %xmm6, %ecx
+; SSE2-NEXT: pextrw $3, %xmm1, %ecx
; SSE2-NEXT: jne .LBB2_13
; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
@@ -939,36 +939,36 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v8i64_v8i8:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa %xmm0, %xmm6
; SSE4-NEXT: pxor %xmm8, %xmm8
-; SSE4-NEXT: movapd {{.*#+}} xmm6 = [255,255]
+; SSE4-NEXT: movapd {{.*#+}} xmm9 = [255,255]
; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm1, %xmm7
-; SSE4-NEXT: pxor %xmm10, %xmm7
-; SSE4-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854776063,9223372036854776063]
-; SSE4-NEXT: movdqa %xmm11, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT: movapd %xmm6, %xmm7
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7
-; SSE4-NEXT: movdqa %xmm9, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm11
+; SSE4-NEXT: pxor %xmm10, %xmm11
+; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854776063,9223372036854776063]
+; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm11, %xmm0
+; SSE4-NEXT: movapd %xmm9, %xmm11
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm11
+; SSE4-NEXT: movdqa %xmm6, %xmm1
; SSE4-NEXT: pxor %xmm10, %xmm1
-; SSE4-NEXT: movdqa %xmm11, %xmm0
+; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movapd %xmm6, %xmm1
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm1
-; SSE4-NEXT: packusdw %xmm7, %xmm1
-; SSE4-NEXT: movdqa %xmm3, %xmm7
-; SSE4-NEXT: pxor %xmm10, %xmm7
-; SSE4-NEXT: movdqa %xmm11, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT: movapd %xmm6, %xmm7
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT: movapd %xmm9, %xmm1
+; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE4-NEXT: packusdw %xmm11, %xmm1
+; SSE4-NEXT: movdqa %xmm3, %xmm6
+; SSE4-NEXT: pxor %xmm10, %xmm6
+; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT: movapd %xmm9, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE4-NEXT: pxor %xmm2, %xmm10
-; SSE4-NEXT: pcmpgtq %xmm10, %xmm11
-; SSE4-NEXT: movdqa %xmm11, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT: packusdw %xmm7, %xmm6
-; SSE4-NEXT: packusdw %xmm6, %xmm1
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm7
+; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
+; SSE4-NEXT: packusdw %xmm6, %xmm9
+; SSE4-NEXT: packusdw %xmm9, %xmm1
; SSE4-NEXT: packuswb %xmm1, %xmm1
; SSE4-NEXT: pcmpeqd %xmm8, %xmm5
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
@@ -1296,35 +1296,35 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm9, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2]
; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm7
+; SSE2-NEXT: por %xmm1, %xmm7
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
+; SSE2-NEXT: pxor %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: movmskps %xmm3, %eax
; SSE2-NEXT: xorl $15, %eax
@@ -1362,22 +1362,22 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v4i64_v4i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm0, %xmm3
; SSE4-NEXT: pxor %xmm6, %xmm6
; SSE4-NEXT: movapd {{.*#+}} xmm5 = [4294967295,4294967295]
; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: pxor %xmm7, %xmm3
+; SSE4-NEXT: movdqa %xmm1, %xmm8
+; SSE4-NEXT: pxor %xmm7, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movapd %xmm5, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE4-NEXT: pxor %xmm8, %xmm7
+; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT: movapd %xmm5, %xmm8
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE4-NEXT: pxor %xmm3, %xmm7
; SSE4-NEXT: pcmpgtq %xmm7, %xmm4
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE4-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE4-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm8[0,2]
; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
; SSE4-NEXT: movmskps %xmm6, %eax
; SSE4-NEXT: xorl $15, %eax
@@ -1492,37 +1492,37 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm9, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2]
; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm7
+; SSE2-NEXT: por %xmm0, %xmm7
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm9, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
@@ -1563,22 +1563,22 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v4i64_v4i16:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm0, %xmm3
; SSE4-NEXT: pxor %xmm6, %xmm6
; SSE4-NEXT: movapd {{.*#+}} xmm5 = [65535,65535]
; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: pxor %xmm7, %xmm3
+; SSE4-NEXT: movdqa %xmm1, %xmm8
+; SSE4-NEXT: pxor %xmm7, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movapd %xmm5, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE4-NEXT: pxor %xmm8, %xmm7
+; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT: movapd %xmm5, %xmm8
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE4-NEXT: pxor %xmm3, %xmm7
; SSE4-NEXT: pcmpgtq %xmm7, %xmm4
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE4-NEXT: packusdw %xmm3, %xmm5
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE4-NEXT: packusdw %xmm8, %xmm5
; SSE4-NEXT: packusdw %xmm5, %xmm5
; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
; SSE4-NEXT: movmskps %xmm6, %eax
@@ -1775,42 +1775,42 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: packuswb %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm1, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: packuswb %xmm6, %xmm4
; SSE2-NEXT: packuswb %xmm4, %xmm4
; SSE2-NEXT: packuswb %xmm4, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm10
-; SSE2-NEXT: movmskps %xmm10, %ecx
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: movmskps %xmm3, %ecx
; SSE2-NEXT: xorl $15, %ecx
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: movd %xmm4, %eax
@@ -1848,26 +1848,26 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE4-LABEL: truncstore_v4i64_v4i8:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm3
-; SSE4-NEXT: pxor %xmm8, %xmm8
+; SSE4-NEXT: pxor %xmm6, %xmm6
; SSE4-NEXT: movapd {{.*#+}} xmm7 = [255,255]
-; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm0, %xmm5
-; SSE4-NEXT: pxor %xmm6, %xmm5
+; SSE4-NEXT: pxor %xmm8, %xmm5
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063]
; SSE4-NEXT: movdqa %xmm4, %xmm0
; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
; SSE4-NEXT: movapd %xmm7, %xmm5
; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5
-; SSE4-NEXT: pxor %xmm1, %xmm6
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm4
+; SSE4-NEXT: pxor %xmm1, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm8, %xmm4
; SSE4-NEXT: movdqa %xmm4, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE4-NEXT: pshufb %xmm0, %xmm7
; SSE4-NEXT: pshufb %xmm0, %xmm5
; SSE4-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm8
-; SSE4-NEXT: movmskps %xmm8, %eax
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE4-NEXT: movmskps %xmm6, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB5_1
@@ -2511,18 +2511,18 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pxor %xmm9, %xmm8
; SSE2-NEXT: por %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: pxor %xmm11, %xmm13
-; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm13, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm11, %xmm0
+; SSE2-NEXT: movdqa %xmm10, %xmm13
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm1
+; SSE2-NEXT: pxor %xmm9, %xmm13
+; SSE2-NEXT: por %xmm1, %xmm13
+; SSE2-NEXT: pslld $16, %xmm13
+; SSE2-NEXT: psrad $16, %xmm13
; SSE2-NEXT: pslld $16, %xmm8
; SSE2-NEXT: psrad $16, %xmm8
-; SSE2-NEXT: packssdw %xmm0, %xmm8
+; SSE2-NEXT: packssdw %xmm13, %xmm8
; SSE2-NEXT: pcmpeqd %xmm12, %xmm7
; SSE2-NEXT: pxor %xmm9, %xmm7
; SSE2-NEXT: pcmpeqd %xmm12, %xmm6
@@ -3231,28 +3231,28 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: pand %xmm13, %xmm1
; SSE2-NEXT: pandn %xmm10, %xmm13
; SSE2-NEXT: por %xmm1, %xmm13
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm12
-; SSE2-NEXT: pand %xmm12, %xmm0
-; SSE2-NEXT: pandn %xmm10, %xmm12
-; SSE2-NEXT: por %xmm0, %xmm12
-; SSE2-NEXT: packuswb %xmm13, %xmm12
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm12
+; SSE2-NEXT: pxor %xmm11, %xmm12
; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm12, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm10, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: packuswb %xmm13, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm11, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm3
+; SSE2-NEXT: pandn %xmm10, %xmm12
+; SSE2-NEXT: por %xmm3, %xmm12
; SSE2-NEXT: pxor %xmm2, %xmm11
; SSE2-NEXT: pcmpgtd %xmm11, %xmm9
; SSE2-NEXT: pand %xmm9, %xmm2
; SSE2-NEXT: pandn %xmm10, %xmm9
; SSE2-NEXT: por %xmm2, %xmm9
-; SSE2-NEXT: packuswb %xmm1, %xmm9
-; SSE2-NEXT: packuswb %xmm9, %xmm12
+; SSE2-NEXT: packuswb %xmm12, %xmm9
+; SSE2-NEXT: packuswb %xmm9, %xmm1
; SSE2-NEXT: pcmpeqd %xmm8, %xmm7
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm7
@@ -3267,7 +3267,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: packsswb %xmm6, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: movd %xmm12, %ecx
+; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: jne .LBB10_1
; SSE2-NEXT: # %bb.2: # %else
; SSE2-NEXT: testb $2, %al
@@ -3283,7 +3283,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB10_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: pextrw $2, %xmm12, %ecx
+; SSE2-NEXT: pextrw $2, %xmm1, %ecx
; SSE2-NEXT: je .LBB10_10
; SSE2-NEXT: # %bb.9: # %cond.store7
; SSE2-NEXT: movb %cl, 4(%rdi)
@@ -3294,7 +3294,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB10_12: # %else10
; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: pextrw $3, %xmm12, %ecx
+; SSE2-NEXT: pextrw $3, %xmm1, %ecx
; SSE2-NEXT: je .LBB10_14
; SSE2-NEXT: # %bb.13: # %cond.store11
; SSE2-NEXT: movb %cl, 6(%rdi)
@@ -3305,7 +3305,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: .LBB10_16: # %else14
; SSE2-NEXT: testl $256, %eax # imm = 0x100
-; SSE2-NEXT: pextrw $4, %xmm12, %ecx
+; SSE2-NEXT: pextrw $4, %xmm1, %ecx
; SSE2-NEXT: je .LBB10_18
; SSE2-NEXT: # %bb.17: # %cond.store15
; SSE2-NEXT: movb %cl, 8(%rdi)
@@ -3316,7 +3316,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: movb %ch, 9(%rdi)
; SSE2-NEXT: .LBB10_20: # %else18
; SSE2-NEXT: testl $1024, %eax # imm = 0x400
-; SSE2-NEXT: pextrw $5, %xmm12, %ecx
+; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: je .LBB10_22
; SSE2-NEXT: # %bb.21: # %cond.store19
; SSE2-NEXT: movb %cl, 10(%rdi)
@@ -3327,7 +3327,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: movb %ch, 11(%rdi)
; SSE2-NEXT: .LBB10_24: # %else22
; SSE2-NEXT: testl $4096, %eax # imm = 0x1000
-; SSE2-NEXT: pextrw $6, %xmm12, %ecx
+; SSE2-NEXT: pextrw $6, %xmm1, %ecx
; SSE2-NEXT: je .LBB10_26
; SSE2-NEXT: # %bb.25: # %cond.store23
; SSE2-NEXT: movb %cl, 12(%rdi)
@@ -3338,7 +3338,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE2-NEXT: movb %ch, 13(%rdi)
; SSE2-NEXT: .LBB10_28: # %else26
; SSE2-NEXT: testl $16384, %eax # imm = 0x4000
-; SSE2-NEXT: pextrw $7, %xmm12, %ecx
+; SSE2-NEXT: pextrw $7, %xmm1, %ecx
; SSE2-NEXT: jne .LBB10_29
; SSE2-NEXT: # %bb.30: # %else28
; SSE2-NEXT: testl $32768, %eax # imm = 0x8000
@@ -3929,30 +3929,30 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i32_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pxor %xmm7, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm6, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm6, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm6, %xmm8
+; SSE2-NEXT: por %xmm1, %xmm8
+; SSE2-NEXT: pslld $16, %xmm8
+; SSE2-NEXT: psrad $16, %xmm8
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: packssdw %xmm4, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE2-NEXT: packssdw %xmm8, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm6, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm6, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: packsswb %xmm2, %xmm2
@@ -3983,42 +3983,42 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: .LBB11_16: # %else14
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB11_1: # %cond.store
-; SSE2-NEXT: movd %xmm5, %ecx
+; SSE2-NEXT: movd %xmm4, %ecx
; SSE2-NEXT: movw %cx, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB11_4
; SSE2-NEXT: .LBB11_3: # %cond.store1
-; SSE2-NEXT: pextrw $1, %xmm5, %ecx
+; SSE2-NEXT: pextrw $1, %xmm4, %ecx
; SSE2-NEXT: movw %cx, 2(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB11_6
; SSE2-NEXT: .LBB11_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm5, %ecx
+; SSE2-NEXT: pextrw $2, %xmm4, %ecx
; SSE2-NEXT: movw %cx, 4(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB11_8
; SSE2-NEXT: .LBB11_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm5, %ecx
+; SSE2-NEXT: pextrw $3, %xmm4, %ecx
; SSE2-NEXT: movw %cx, 6(%rdi)
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je .LBB11_10
; SSE2-NEXT: .LBB11_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm5, %ecx
+; SSE2-NEXT: pextrw $4, %xmm4, %ecx
; SSE2-NEXT: movw %cx, 8(%rdi)
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je .LBB11_12
; SSE2-NEXT: .LBB11_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm5, %ecx
+; SSE2-NEXT: pextrw $5, %xmm4, %ecx
; SSE2-NEXT: movw %cx, 10(%rdi)
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je .LBB11_14
; SSE2-NEXT: .LBB11_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm5, %ecx
+; SSE2-NEXT: pextrw $6, %xmm4, %ecx
; SSE2-NEXT: movw %cx, 12(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB11_16
; SSE2-NEXT: .LBB11_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm5, %eax
+; SSE2-NEXT: pextrw $7, %xmm4, %eax
; SSE2-NEXT: movw %ax, 14(%rdi)
; SSE2-NEXT: retq
;
@@ -4332,34 +4332,34 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i32_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm7, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: pxor %xmm7, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm9
+; SSE2-NEXT: por %xmm1, %xmm9
; SSE2-NEXT: pxor %xmm0, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm9, %xmm6
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm4, %xmm6
-; SSE2-NEXT: packuswb %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: packuswb %xmm9, %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: packsswb %xmm2, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: movd %xmm6, %ecx
+; SSE2-NEXT: movd %xmm4, %ecx
; SSE2-NEXT: jne .LBB12_1
; SSE2-NEXT: # %bb.2: # %else
; SSE2-NEXT: testb $2, %al
@@ -4375,7 +4375,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB12_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: pextrw $2, %xmm6, %ecx
+; SSE2-NEXT: pextrw $2, %xmm4, %ecx
; SSE2-NEXT: je .LBB12_10
; SSE2-NEXT: # %bb.9: # %cond.store7
; SSE2-NEXT: movb %cl, 4(%rdi)
@@ -4386,7 +4386,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB12_12: # %else10
; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: pextrw $3, %xmm6, %ecx
+; SSE2-NEXT: pextrw $3, %xmm4, %ecx
; SSE2-NEXT: jne .LBB12_13
; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
@@ -5185,13 +5185,13 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
; SSE2-LABEL: truncstore_v32i16_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: psubusw %xmm8, %xmm6
-; SSE2-NEXT: psubw %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: psubusw %xmm8, %xmm6
-; SSE2-NEXT: psubw %xmm6, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psubusw %xmm6, %xmm8
+; SSE2-NEXT: psubw %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: psubusw %xmm6, %xmm8
+; SSE2-NEXT: psubw %xmm8, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: pcmpeqb %xmm7, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %ecx
@@ -5268,9 +5268,9 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
; SSE2-NEXT: movb %cl, 12(%rdi)
; SSE2-NEXT: .LBB15_26: # %else24
; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: psubusw %xmm8, %xmm1
+; SSE2-NEXT: psubusw %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psubusw %xmm8, %xmm4
+; SSE2-NEXT: psubusw %xmm6, %xmm4
; SSE2-NEXT: testl $8192, %eax # imm = 0x2000
; SSE2-NEXT: je .LBB15_28
; SSE2-NEXT: # %bb.27: # %cond.store25
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 9ed0178c0df42..f7505eddacb2c 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -485,25 +485,25 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX1-FALLBACK-NEXT: vpsubq %xmm6, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm3, %xmm9
+; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm3, %xmm6
; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; AVX1-FALLBACK-NEXT: vpor %xmm2, %xmm8, %xmm2
; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm6, %xmm1
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm9
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm7, %xmm2
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm9, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm6, %xmm3
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm7, %xmm3
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
@@ -536,12 +536,12 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm3
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOP-FALLBACK-NEXT: vpcomgtq %xmm3, %xmm4, %xmm9
+; XOP-FALLBACK-NEXT: vpcomgtq %xmm3, %xmm4, %xmm5
; XOP-FALLBACK-NEXT: vpcomltq %xmm3, %xmm4, %xmm6
; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm6
; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm7
; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7
-; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm4, %xmm3, %xmm3
+; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1
@@ -551,13 +551,13 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOP-FALLBACK-NEXT: vpor %xmm2, %xmm8, %xmm2
; XOP-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm5
-; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm9
+; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpmuludq %xmm2, %xmm7, %xmm2
; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm3, %xmm3
-; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -576,12 +576,12 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT: vpcomgtq %xmm3, %xmm4, %xmm9
+; XOPAVX1-NEXT: vpcomgtq %xmm3, %xmm4, %xmm5
; XOPAVX1-NEXT: vpcomltq %xmm3, %xmm4, %xmm6
; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm6
; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm7
; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7
-; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm6, %xmm3, %xmm3
; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1
@@ -591,13 +591,13 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOPAVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
; XOPAVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
-; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT: vpsrlq $32, %xmm2, %xmm9
+; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; XOPAVX1-NEXT: vpmuludq %xmm2, %xmm7, %xmm2
; XOPAVX1-NEXT: vpsrlq $33, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -685,40 +685,40 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm0, %xmm4
-; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8
+; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm6, %xmm7
-; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm5, %xmm2
+; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm8, %xmm2
; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9
; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2
-; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm5, %xmm6, %xmm2
+; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm8, %xmm6, %xmm2
; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3
-; AVX1-FALLBACK-NEXT: vblendvpd %xmm9, %xmm5, %xmm6, %xmm4
+; AVX1-FALLBACK-NEXT: vblendvpd %xmm9, %xmm8, %xmm6, %xmm4
; AVX1-FALLBACK-NEXT: vpsubq %xmm2, %xmm4, %xmm2
-; AVX1-FALLBACK-NEXT: vblendvpd %xmm8, %xmm0, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm2, %xmm3
; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm4
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm10 = [1,1]
-; AVX1-FALLBACK-NEXT: vpor %xmm10, %xmm8, %xmm7
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm4, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm6, %xmm1
+; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1]
+; AVX1-FALLBACK-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm4, %xmm7
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm7, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpor %xmm10, %xmm9, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm6, %xmm7
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm3, %xmm7
-; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2
+; AVX1-FALLBACK-NEXT: vpor %xmm6, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm3, %xmm6
+; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm6, %xmm2
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm8, %xmm2
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
@@ -754,12 +754,12 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; XOP-FALLBACK-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm3
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOP-FALLBACK-NEXT: vpcomgtuq %xmm3, %xmm4, %xmm9
+; XOP-FALLBACK-NEXT: vpcomgtuq %xmm3, %xmm4, %xmm5
; XOP-FALLBACK-NEXT: vpcomltuq %xmm3, %xmm4, %xmm6
; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm6
; XOP-FALLBACK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm7
; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7
-; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm4, %xmm3, %xmm3
+; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1
@@ -769,13 +769,13 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOP-FALLBACK-NEXT: vpor %xmm2, %xmm8, %xmm2
; XOP-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm5
-; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm9
+; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpmuludq %xmm2, %xmm7, %xmm2
; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm3, %xmm3
-; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -794,12 +794,12 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; XOPAVX1-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT: vpcomgtuq %xmm3, %xmm4, %xmm9
+; XOPAVX1-NEXT: vpcomgtuq %xmm3, %xmm4, %xmm5
; XOPAVX1-NEXT: vpcomltuq %xmm3, %xmm4, %xmm6
; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm6
; XOPAVX1-NEXT: vpcomltuq %xmm1, %xmm0, %xmm7
; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7
-; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm6, %xmm3, %xmm3
; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1
@@ -809,13 +809,13 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOPAVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
; XOPAVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
-; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT: vpsrlq $32, %xmm2, %xmm9
+; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; XOPAVX1-NEXT: vpmuludq %xmm2, %xmm7, %xmm2
; XOPAVX1-NEXT: vpsrlq $33, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -913,25 +913,25 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX1-FALLBACK-NEXT: vpsubq %xmm6, %xmm4, %xmm4
; AVX1-FALLBACK-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vpsubq %xmm7, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm4, %xmm9
+; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm4, %xmm6
; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; AVX1-FALLBACK-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm9
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm9, %xmm0
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm7, %xmm3
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm4, %xmm4
; AVX1-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm9, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm6, %xmm4
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm7, %xmm4
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
@@ -966,12 +966,12 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm2
; XOP-FALLBACK-NEXT: vpcomgtq %xmm0, %xmm1, %xmm3
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOP-FALLBACK-NEXT: vpcomgtq %xmm4, %xmm2, %xmm9
+; XOP-FALLBACK-NEXT: vpcomgtq %xmm4, %xmm2, %xmm5
; XOP-FALLBACK-NEXT: vpcomltq %xmm4, %xmm2, %xmm6
; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm2, %xmm4, %xmm6
; XOP-FALLBACK-NEXT: vpcomltq %xmm0, %xmm1, %xmm7
; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm1, %xmm0, %xmm7
-; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm2, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm4
; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm4, %xmm4
; XOP-FALLBACK-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm0, %xmm0
@@ -981,13 +981,13 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOP-FALLBACK-NEXT: vpor %xmm3, %xmm8, %xmm3
; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm5
-; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm9
+; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm9, %xmm0
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm7, %xmm3
; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm4, %xmm4
-; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -1007,12 +1007,12 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; XOPAVX1-NEXT: vpcomgtq %xmm0, %xmm1, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT: vpcomgtq %xmm4, %xmm2, %xmm9
+; XOPAVX1-NEXT: vpcomgtq %xmm4, %xmm2, %xmm5
; XOPAVX1-NEXT: vpcomltq %xmm4, %xmm2, %xmm6
; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm2, %xmm4, %xmm6
; XOPAVX1-NEXT: vpcomltq %xmm0, %xmm1, %xmm7
; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm0, %xmm7
-; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm4
; XOPAVX1-NEXT: vpsubq %xmm6, %xmm4, %xmm4
; XOPAVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vpsubq %xmm7, %xmm0, %xmm0
@@ -1022,13 +1022,13 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOPAVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
-; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm9
+; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm9, %xmm0
; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm7, %xmm3
; XOPAVX1-NEXT: vpsrlq $33, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -1126,25 +1126,25 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX1-FALLBACK-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; AVX1-FALLBACK-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm2, %xmm9
+; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; AVX1-FALLBACK-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm6, %xmm1
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm9
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm7, %xmm3
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm2, %xmm2
; AVX1-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm9, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm6, %xmm2
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
@@ -1179,12 +1179,12 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm2
; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm3
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOP-FALLBACK-NEXT: vpcomgtq %xmm2, %xmm4, %xmm9
+; XOP-FALLBACK-NEXT: vpcomgtq %xmm2, %xmm4, %xmm5
; XOP-FALLBACK-NEXT: vpcomltq %xmm2, %xmm4, %xmm6
; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm4, %xmm2, %xmm6
; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm7
; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7
-; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1
@@ -1194,13 +1194,13 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOP-FALLBACK-NEXT: vpor %xmm3, %xmm8, %xmm3
; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm5
-; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm9
+; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm7, %xmm3
; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -1220,12 +1220,12 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT: vpcomgtq %xmm2, %xmm4, %xmm9
+; XOPAVX1-NEXT: vpcomgtq %xmm2, %xmm4, %xmm5
; XOPAVX1-NEXT: vpcomltq %xmm2, %xmm4, %xmm6
; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm2, %xmm6
; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm7
; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7
-; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1
@@ -1235,13 +1235,13 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOPAVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
-; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm9
+; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm7, %xmm3
; XOPAVX1-NEXT: vpsrlq $33, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -1340,25 +1340,25 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-FALLBACK-NEXT: vpsubq %xmm6, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vblendvpd %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vpsubq %xmm7, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm9
+; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; AVX1-FALLBACK-NEXT: vpor %xmm4, %xmm8, %xmm4
; AVX1-FALLBACK-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm4, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm4, %xmm9
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm9, %xmm0
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm9, %xmm6
-; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm6, %xmm1
+; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm7, %xmm1
; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1
; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm0, %xmm0
@@ -1395,12 +1395,12 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm4
-; XOP-FALLBACK-NEXT: vpcomgtq %xmm2, %xmm3, %xmm9
+; XOP-FALLBACK-NEXT: vpcomgtq %xmm2, %xmm3, %xmm5
; XOP-FALLBACK-NEXT: vpcomltq %xmm2, %xmm3, %xmm6
; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm7
; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7
-; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1
@@ -1410,13 +1410,13 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOP-FALLBACK-NEXT: vpor %xmm4, %xmm8, %xmm4
; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm4, %xmm5
-; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm4, %xmm9
+; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -1437,12 +1437,12 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm0
; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm4
-; XOPAVX1-NEXT: vpcomgtq %xmm2, %xmm3, %xmm9
+; XOPAVX1-NEXT: vpcomgtq %xmm2, %xmm3, %xmm5
; XOPAVX1-NEXT: vpcomltq %xmm2, %xmm3, %xmm6
; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm7
; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7
-; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1
@@ -1452,13 +1452,13 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
; XOPAVX1-NEXT: vpor %xmm4, %xmm8, %xmm4
; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrlq $32, %xmm4, %xmm5
-; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOPAVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOPAVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
+; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
; XOPAVX1-NEXT: vpsrlq $33, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7
; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -1700,10 +1700,10 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FALLBACK-NEXT: vpminuw %xmm2, %xmm3, %xmm4
; AVX1-FALLBACK-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm5
-; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX1-FALLBACK-NEXT: vpxor %xmm5, %xmm8, %xmm5
+; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm7
-; AVX1-FALLBACK-NEXT: vpcmpeqw %xmm7, %xmm0, %xmm6
+; AVX1-FALLBACK-NEXT: vpcmpeqw %xmm7, %xmm0, %xmm8
; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm8, %xmm6
; AVX1-FALLBACK-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
; AVX1-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
@@ -2317,7 +2317,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX1-FALLBACK: # %bb.0:
; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm8
+; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5
; AVX1-FALLBACK-NEXT: vpminsb %xmm3, %xmm2, %xmm6
; AVX1-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm7
@@ -2333,25 +2333,25 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
-; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
+; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpand %xmm1, %xmm8, %xmm1
+; AVX1-FALLBACK-NEXT: vpackuswb %xmm6, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm5, %xmm5
+; AVX1-FALLBACK-NEXT: vpand %xmm5, %xmm8, %xmm5
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-FALLBACK-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2383,7 +2383,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8
+; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5
; XOP-FALLBACK-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm7
@@ -2397,21 +2397,21 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2421,7 +2421,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8
+; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5
; XOPAVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; XOPAVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm7
@@ -2435,21 +2435,21 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOPAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOPAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOPAVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2540,10 +2540,10 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX1-FALLBACK-NEXT: vpminub %xmm3, %xmm2, %xmm4
; AVX1-FALLBACK-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5
; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm5, %xmm8
-; AVX1-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm7
-; AVX1-FALLBACK-NEXT: vpcmpeqb %xmm7, %xmm0, %xmm5
; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm5, %xmm5
+; AVX1-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm7
+; AVX1-FALLBACK-NEXT: vpcmpeqb %xmm7, %xmm0, %xmm8
+; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm8, %xmm6
; AVX1-FALLBACK-NEXT: vpmaxub %xmm3, %xmm2, %xmm3
; AVX1-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-FALLBACK-NEXT: vpsubb %xmm7, %xmm1, %xmm1
@@ -2554,26 +2554,26 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX1-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-FALLBACK-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-FALLBACK-NEXT: vpand %xmm7, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm8, %xmm4
+; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-FALLBACK-NEXT: vpand %xmm4, %xmm8, %xmm4
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT: vpand %xmm7, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT: vpand %xmm1, %xmm8, %xmm1
; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpor %xmm6, %xmm8, %xmm5
+; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT: vpand %xmm7, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpand %xmm4, %xmm8, %xmm4
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3
; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2608,7 +2608,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; XOP-FALLBACK: # %bb.0:
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOP-FALLBACK-NEXT: vpcomgtub %xmm2, %xmm3, %xmm8
+; XOP-FALLBACK-NEXT: vpcomgtub %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpcomgtub %xmm1, %xmm0, %xmm5
; XOP-FALLBACK-NEXT: vpminub %xmm2, %xmm3, %xmm6
; XOP-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm7
@@ -2622,21 +2622,21 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2646,7 +2646,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpcomgtub %xmm2, %xmm3, %xmm8
+; XOPAVX1-NEXT: vpcomgtub %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpcomgtub %xmm1, %xmm0, %xmm5
; XOPAVX1-NEXT: vpminub %xmm2, %xmm3, %xmm6
; XOPAVX1-NEXT: vpminub %xmm1, %xmm0, %xmm7
@@ -2660,21 +2660,21 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOPAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOPAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOPAVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2767,7 +2767,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm8
+; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm5
; AVX1-FALLBACK-NEXT: vpminsb %xmm3, %xmm2, %xmm6
; AVX1-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm7
@@ -2783,25 +2783,25 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
-; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
+; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpand %xmm0, %xmm8, %xmm0
+; AVX1-FALLBACK-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm5, %xmm5
+; AVX1-FALLBACK-NEXT: vpand %xmm5, %xmm8, %xmm5
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-FALLBACK-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2835,7 +2835,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8
+; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpcomgtb %xmm0, %xmm1, %xmm5
; XOP-FALLBACK-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; XOP-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm7
@@ -2849,21 +2849,21 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2874,7 +2874,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8
+; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpcomgtb %xmm0, %xmm1, %xmm5
; XOPAVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; XOPAVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm7
@@ -2888,21 +2888,21 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOPAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOPAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOPAVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2995,7 +2995,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm8
+; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4
; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5
; AVX1-FALLBACK-NEXT: vpminsb %xmm3, %xmm1, %xmm6
; AVX1-FALLBACK-NEXT: vpminsb %xmm2, %xmm0, %xmm7
@@ -3011,25 +3011,25 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
-; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
+; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpand %xmm2, %xmm8, %xmm2
+; AVX1-FALLBACK-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm5, %xmm5
+; AVX1-FALLBACK-NEXT: vpand %xmm5, %xmm8, %xmm5
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-FALLBACK-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -3063,7 +3063,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm2
; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8
+; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5
; XOP-FALLBACK-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm7
@@ -3077,21 +3077,21 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -3102,7 +3102,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8
+; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5
; XOPAVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
; XOPAVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm7
@@ -3116,21 +3116,21 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOPAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOPAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOPAVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -3224,7 +3224,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-FALLBACK-NEXT: vmovdqa 16(%rsi), %xmm3
; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm8
+; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4
; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5
; AVX1-FALLBACK-NEXT: vpminsb %xmm3, %xmm1, %xmm6
; AVX1-FALLBACK-NEXT: vpminsb %xmm2, %xmm0, %xmm7
@@ -3240,25 +3240,25 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
-; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
+; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm5
-; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpand %xmm2, %xmm8, %xmm2
+; AVX1-FALLBACK-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm5, %xmm5
+; AVX1-FALLBACK-NEXT: vpand %xmm5, %xmm8, %xmm5
; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-FALLBACK-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
; AVX1-FALLBACK-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -3294,7 +3294,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vmovdqa 16(%rsi), %xmm1
; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm2
; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm3, %xmm8
+; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4
; XOP-FALLBACK-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5
; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm3, %xmm6
; XOP-FALLBACK-NEXT: vpminsb %xmm0, %xmm2, %xmm7
@@ -3308,21 +3308,21 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -3334,7 +3334,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vmovdqa 16(%rsi), %xmm1
; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm2
; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm3, %xmm8
+; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4
; XOPAVX1-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5
; XOPAVX1-NEXT: vpminsb %xmm1, %xmm3, %xmm6
; XOPAVX1-NEXT: vpminsb %xmm0, %xmm2, %xmm7
@@ -3348,21 +3348,21 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; XOPAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6
-; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; XOPAVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; XOPAVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
; XOPAVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/misched-matmul.ll b/llvm/test/CodeGen/X86/misched-matmul.ll
index 6e9f8fca180e7..a6c489dcb3dad 100644
--- a/llvm/test/CodeGen/X86/misched-matmul.ll
+++ b/llvm/test/CodeGen/X86/misched-matmul.ll
@@ -10,7 +10,7 @@
; more complex cases.
;
; CHECK: @wrap_mul4
-; CHECK: 25 regalloc - Number of spills inserted
+; CHECK: 24 regalloc - Number of spills inserted
define void @wrap_mul4(ptr nocapture %Out, ptr nocapture %A, ptr nocapture %B) #0 {
entry:
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 8c6e0fdda194e..a6116175226ea 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -426,19 +426,19 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind {
;
; X64-LABEL: test3:
; X64: # %bb.0: # %entry
-; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: testl %edx, %edx
; X64-NEXT: je .LBB3_2
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_1: # %bb26
; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movslq %r8d, %r8
-; X64-NEXT: movq (%rdi,%r8,8), %rcx
-; X64-NEXT: addq (%rsi,%r8,8), %rcx
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: incl %r8d
-; X64-NEXT: cmpl %edx, %r8d
+; X64-NEXT: movslq %ecx, %rcx
+; X64-NEXT: movq (%rdi,%rcx,8), %r8
+; X64-NEXT: addq (%rsi,%rcx,8), %r8
+; X64-NEXT: addq %r8, %rax
+; X64-NEXT: incl %ecx
+; X64-NEXT: cmpl %edx, %ecx
; X64-NEXT: jb .LBB3_1
; X64-NEXT: .LBB3_2: # %bb31
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll
index a8e599f128da2..59605ff5d843c 100644
--- a/llvm/test/CodeGen/X86/mul-constant-result.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-result.ll
@@ -898,21 +898,21 @@ define i32 @foo() local_unnamed_addr #0 {
; X64-HSW-NEXT: movl $5, %edi
; X64-HSW-NEXT: movl $2, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %r14d
-; X64-HSW-NEXT: xorl $5, %r14d
+; X64-HSW-NEXT: movl %eax, %ebp
+; X64-HSW-NEXT: xorl $5, %ebp
; X64-HSW-NEXT: movl $6, %edi
; X64-HSW-NEXT: movl $3, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebp
-; X64-HSW-NEXT: xorl $6, %ebp
-; X64-HSW-NEXT: orl %r14d, %ebp
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $6, %r14d
+; X64-HSW-NEXT: orl %ebp, %r14d
; X64-HSW-NEXT: movl $7, %edi
; X64-HSW-NEXT: movl $3, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %r14d
-; X64-HSW-NEXT: xorl $7, %r14d
-; X64-HSW-NEXT: orl %ebp, %r14d
-; X64-HSW-NEXT: orl %ebx, %r14d
+; X64-HSW-NEXT: movl %eax, %ebp
+; X64-HSW-NEXT: xorl $7, %ebp
+; X64-HSW-NEXT: orl %r14d, %ebp
+; X64-HSW-NEXT: orl %ebx, %ebp
; X64-HSW-NEXT: movl $8, %edi
; X64-HSW-NEXT: movl $4, %esi
; X64-HSW-NEXT: callq mult@PLT
@@ -921,88 +921,88 @@ define i32 @foo() local_unnamed_addr #0 {
; X64-HSW-NEXT: movl $9, %edi
; X64-HSW-NEXT: movl $4, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebp
-; X64-HSW-NEXT: xorl $9, %ebp
-; X64-HSW-NEXT: orl %ebx, %ebp
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $9, %r14d
+; X64-HSW-NEXT: orl %ebx, %r14d
; X64-HSW-NEXT: movl $10, %edi
; X64-HSW-NEXT: movl $5, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $10, %ebx
-; X64-HSW-NEXT: orl %ebp, %ebx
+; X64-HSW-NEXT: movl %eax, %r15d
+; X64-HSW-NEXT: xorl $10, %r15d
+; X64-HSW-NEXT: orl %r14d, %r15d
; X64-HSW-NEXT: movl $11, %edi
; X64-HSW-NEXT: movl $5, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %r15d
-; X64-HSW-NEXT: xorl $11, %r15d
-; X64-HSW-NEXT: orl %ebx, %r15d
-; X64-HSW-NEXT: orl %r14d, %r15d
+; X64-HSW-NEXT: movl %eax, %ebx
+; X64-HSW-NEXT: xorl $11, %ebx
+; X64-HSW-NEXT: orl %r15d, %ebx
+; X64-HSW-NEXT: orl %ebp, %ebx
; X64-HSW-NEXT: movl $12, %edi
; X64-HSW-NEXT: movl $6, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $12, %ebx
+; X64-HSW-NEXT: movl %eax, %ebp
+; X64-HSW-NEXT: xorl $12, %ebp
; X64-HSW-NEXT: movl $13, %edi
; X64-HSW-NEXT: movl $6, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebp
-; X64-HSW-NEXT: xorl $13, %ebp
-; X64-HSW-NEXT: orl %ebx, %ebp
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $13, %r14d
+; X64-HSW-NEXT: orl %ebp, %r14d
; X64-HSW-NEXT: movl $14, %edi
; X64-HSW-NEXT: movl $7, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $14, %ebx
-; X64-HSW-NEXT: orl %ebp, %ebx
+; X64-HSW-NEXT: movl %eax, %ebp
+; X64-HSW-NEXT: xorl $14, %ebp
+; X64-HSW-NEXT: orl %r14d, %ebp
; X64-HSW-NEXT: movl $15, %edi
; X64-HSW-NEXT: movl $7, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebp
-; X64-HSW-NEXT: xorl $15, %ebp
-; X64-HSW-NEXT: orl %ebx, %ebp
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $15, %r14d
+; X64-HSW-NEXT: orl %ebp, %r14d
; X64-HSW-NEXT: movl $16, %edi
; X64-HSW-NEXT: movl $8, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %r14d
-; X64-HSW-NEXT: xorl $16, %r14d
-; X64-HSW-NEXT: orl %ebp, %r14d
-; X64-HSW-NEXT: orl %r15d, %r14d
+; X64-HSW-NEXT: movl %eax, %ebp
+; X64-HSW-NEXT: xorl $16, %ebp
+; X64-HSW-NEXT: orl %r14d, %ebp
+; X64-HSW-NEXT: orl %ebx, %ebp
; X64-HSW-NEXT: movl $17, %edi
; X64-HSW-NEXT: movl $8, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebp
-; X64-HSW-NEXT: xorl $17, %ebp
+; X64-HSW-NEXT: movl %eax, %ebx
+; X64-HSW-NEXT: xorl $17, %ebx
; X64-HSW-NEXT: movl $18, %edi
; X64-HSW-NEXT: movl $9, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $18, %ebx
-; X64-HSW-NEXT: orl %ebp, %ebx
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $18, %r14d
+; X64-HSW-NEXT: orl %ebx, %r14d
; X64-HSW-NEXT: movl $19, %edi
; X64-HSW-NEXT: movl $9, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebp
-; X64-HSW-NEXT: xorl $19, %ebp
-; X64-HSW-NEXT: orl %ebx, %ebp
+; X64-HSW-NEXT: movl %eax, %ebx
+; X64-HSW-NEXT: xorl $19, %ebx
+; X64-HSW-NEXT: orl %r14d, %ebx
; X64-HSW-NEXT: movl $20, %edi
; X64-HSW-NEXT: movl $10, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $20, %ebx
-; X64-HSW-NEXT: orl %ebp, %ebx
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $20, %r14d
+; X64-HSW-NEXT: orl %ebx, %r14d
; X64-HSW-NEXT: movl $21, %edi
; X64-HSW-NEXT: movl $10, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebp
-; X64-HSW-NEXT: xorl $21, %ebp
-; X64-HSW-NEXT: orl %ebx, %ebp
+; X64-HSW-NEXT: movl %eax, %r15d
+; X64-HSW-NEXT: xorl $21, %r15d
+; X64-HSW-NEXT: orl %r14d, %r15d
; X64-HSW-NEXT: movl $22, %edi
; X64-HSW-NEXT: movl $11, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %r15d
-; X64-HSW-NEXT: xorl $22, %r15d
-; X64-HSW-NEXT: orl %ebp, %r15d
-; X64-HSW-NEXT: orl %r14d, %r15d
+; X64-HSW-NEXT: movl %eax, %ebx
+; X64-HSW-NEXT: xorl $22, %ebx
+; X64-HSW-NEXT: orl %r15d, %ebx
+; X64-HSW-NEXT: orl %ebp, %ebx
; X64-HSW-NEXT: movl $23, %edi
; X64-HSW-NEXT: movl $11, %esi
; X64-HSW-NEXT: callq mult@PLT
@@ -1011,58 +1011,58 @@ define i32 @foo() local_unnamed_addr #0 {
; X64-HSW-NEXT: movl $24, %edi
; X64-HSW-NEXT: movl $12, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $24, %ebx
-; X64-HSW-NEXT: orl %ebp, %ebx
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $24, %r14d
+; X64-HSW-NEXT: orl %ebp, %r14d
; X64-HSW-NEXT: movl $25, %edi
; X64-HSW-NEXT: movl $12, %esi
; X64-HSW-NEXT: callq mult@PLT
; X64-HSW-NEXT: movl %eax, %ebp
; X64-HSW-NEXT: xorl $25, %ebp
-; X64-HSW-NEXT: orl %ebx, %ebp
+; X64-HSW-NEXT: orl %r14d, %ebp
; X64-HSW-NEXT: movl $26, %edi
; X64-HSW-NEXT: movl $13, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $26, %ebx
-; X64-HSW-NEXT: orl %ebp, %ebx
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $26, %r14d
+; X64-HSW-NEXT: orl %ebp, %r14d
; X64-HSW-NEXT: movl $27, %edi
; X64-HSW-NEXT: movl $13, %esi
; X64-HSW-NEXT: callq mult@PLT
; X64-HSW-NEXT: movl %eax, %ebp
; X64-HSW-NEXT: xorl $27, %ebp
-; X64-HSW-NEXT: orl %ebx, %ebp
+; X64-HSW-NEXT: orl %r14d, %ebp
; X64-HSW-NEXT: movl $28, %edi
; X64-HSW-NEXT: movl $14, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $28, %ebx
-; X64-HSW-NEXT: orl %ebp, %ebx
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $28, %r14d
+; X64-HSW-NEXT: orl %ebp, %r14d
; X64-HSW-NEXT: movl $29, %edi
; X64-HSW-NEXT: movl $14, %esi
; X64-HSW-NEXT: callq mult@PLT
; X64-HSW-NEXT: movl %eax, %ebp
; X64-HSW-NEXT: xorl $29, %ebp
+; X64-HSW-NEXT: orl %r14d, %ebp
; X64-HSW-NEXT: orl %ebx, %ebp
-; X64-HSW-NEXT: orl %r15d, %ebp
; X64-HSW-NEXT: movl $30, %edi
; X64-HSW-NEXT: movl $15, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %r14d
-; X64-HSW-NEXT: xorl $30, %r14d
+; X64-HSW-NEXT: movl %eax, %ebx
+; X64-HSW-NEXT: xorl $30, %ebx
; X64-HSW-NEXT: movl $31, %edi
; X64-HSW-NEXT: movl $15, %esi
; X64-HSW-NEXT: callq mult@PLT
-; X64-HSW-NEXT: movl %eax, %ebx
-; X64-HSW-NEXT: xorl $31, %ebx
-; X64-HSW-NEXT: orl %r14d, %ebx
-; X64-HSW-NEXT: orl %ebp, %ebx
+; X64-HSW-NEXT: movl %eax, %r14d
+; X64-HSW-NEXT: xorl $31, %r14d
+; X64-HSW-NEXT: orl %ebx, %r14d
+; X64-HSW-NEXT: orl %ebp, %r14d
; X64-HSW-NEXT: movl $32, %edi
; X64-HSW-NEXT: movl $16, %esi
; X64-HSW-NEXT: callq mult@PLT
; X64-HSW-NEXT: xorl $32, %eax
; X64-HSW-NEXT: xorl %ecx, %ecx
-; X64-HSW-NEXT: orl %ebx, %eax
+; X64-HSW-NEXT: orl %r14d, %eax
; X64-HSW-NEXT: setne %cl
; X64-HSW-NEXT: negl %ecx
; X64-HSW-NEXT: movl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index faa979ece8e65..015137b3d350c 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -4804,281 +4804,282 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: pushq %rbx
; X64-NEXT: subq $240, %rsp
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, %rbp
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 40(%rdi), %r15
-; X64-NEXT: movq 32(%rdi), %r9
-; X64-NEXT: movq 56(%rdi), %r8
-; X64-NEXT: movq 48(%rdi), %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq (%rsi), %rdi
-; X64-NEXT: movq 8(%rsi), %r11
-; X64-NEXT: movq %rsi, %r13
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq 40(%rdi), %r12
+; X64-NEXT: movq 32(%rdi), %r14
+; X64-NEXT: movq 56(%rdi), %r15
+; X64-NEXT: movq 48(%rdi), %r10
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq (%rsi), %r11
+; X64-NEXT: movq 8(%rsi), %r8
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rcx, %r9
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rbp, %r10
-; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: adcq %rsi, %r10
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: adcq %rsi, %r8
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movzbl %al, %r9d
; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r10, %rsi
+; X64-NEXT: adcq %r9, %r13
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbp, %rbx
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r15, %rbp
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rbx, %rcx
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %r12, %rcx
-; X64-NEXT: adcq %r10, %r15
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r13, %rdi
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 16(%r13), %r10
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq %rbp, %r11
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %r9, %r11
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: addq %r11, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r10, %r9
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %r11
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: adcq %rcx, %r11
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %r13
; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r12, %rsi
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq 24(%rdi), %rdi
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq 16(%rbp), %r15
+; X64-NEXT: movq %r14, %r10
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r12, %rcx
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rdi, %r14
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq 24(%rbp), %rdi
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r12
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: addq %rsi, %rax
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: adcq %rbx, %rbp
-; X64-NEXT: setb %r9b
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %rdi, %rbp
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: adcq %r9, %r12
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rbp, %rbx
-; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %r12, %r9
+; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: addq %rcx, %r13
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: addq %rbx, %r8
+; X64-NEXT: movq %r8, (%rsp) # 8-byte Spill
+; X64-NEXT: adcq %r11, %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r9
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: addq %r14, %rbx
-; X64-NEXT: adcq %r8, %rdi
-; X64-NEXT: setb %r11b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r10
+; X64-NEXT: addq %rsi, %r9
+; X64-NEXT: adcq %r13, %rdi
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r14, %rbp
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rcx, %r11
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %r11, %rax
+; X64-NEXT: movq %rax, %r11
; X64-NEXT: adcq %rsi, %rcx
; X64-NEXT: setb %sil
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rbp
; X64-NEXT: addq %rcx, %rax
; X64-NEXT: movzbl %sil, %ecx
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %rbx, %r15
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rdi, %rbp
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %r11b, %ecx
+; X64-NEXT: addq %r9, %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %rdi, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movzbl %r10b, %ecx
; X64-NEXT: adcq %rcx, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq 16(%rsi), %rcx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq 24(%rsi), %rbx
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq 16(%r8), %rsi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq 24(%r8), %r14
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %r13, %r15
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r9, %rbp
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rcx, %r11
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rbp, %r15
-; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r11, %rsi
+; X64-NEXT: adcq %rdi, %rbx
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r13
+; X64-NEXT: movq %r13, %r12
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %rbx, %rdi
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %rcx
+; X64-NEXT: movq (%r8), %r13
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rcx, %r12
-; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: movq 8(%r8), %rbp
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r11, %r14
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq %r12, %r11
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %rbx, %r12
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r12, %rbx
+; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %r11
-; X64-NEXT: movq (%rsi), %rdi
+; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: adcq %rsi, %r11
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq %r13, %r10
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rsi, %r14
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: adcq %r9, %r12
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r12, %rsi
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %r9
+; X64-NEXT: addq %rbx, %r13
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r11, %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: addq %rdi, %rsi
+; X64-NEXT: adcq %rcx, %r9
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 8(%rsi), %r9
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: adcq $0, %r11
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbp, %rsi
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rsi, %rcx
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %r13
-; X64-NEXT: addq %r14, %rcx
-; X64-NEXT: adcq %r15, %r13
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %rdi, %rbp
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: adcq %rbx, %rbp
-; X64-NEXT: setb %r15b
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rbp, %rbx
-; X64-NEXT: movzbl %r15b, %eax
-; X64-NEXT: adcq %rax, %r8
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r13, %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: addq %r12, %rbx
-; X64-NEXT: adcq %r11, %r8
-; X64-NEXT: setb %r14b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rbp, %r10
-; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movzbl %dil, %ecx
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %rbx, %r11
-; X64-NEXT: adcq %r8, %r10
-; X64-NEXT: movzbl %r14b, %ecx
-; X64-NEXT: adcq %rcx, %rax
+; X64-NEXT: addq %rbx, %rcx
+; X64-NEXT: adcq %r11, %r14
+; X64-NEXT: setb %r11b
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: movzbl %r11b, %edi
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: addq %rsi, %r13
+; X64-NEXT: adcq %r9, %rcx
+; X64-NEXT: movzbl %r10b, %esi
+; X64-NEXT: adcq %rsi, %rax
; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq (%rsp), %rax # 8-byte Folded Reload
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -5086,281 +5087,285 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq 32(%rsi), %rdi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r9, %rbx
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq 32(%rcx), %rdi
+; X64-NEXT: movq %r8, %r10
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r11
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq 40(%rsi), %rcx
-; X64-NEXT: movq %rsi, %r13
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rbp, %r9
-; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: movq %r11, (%rsp) # 8-byte Spill
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rsi, %r11
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq 40(%rcx), %rsi
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: addq %r11, %rsi
+; X64-NEXT: adcq %r9, %rbx
+; X64-NEXT: setb %r10b
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rbx, %r11
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %r9
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rdi, %rbp
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rbp, %rcx
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %r8
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: adcq %r9, %r8
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %rbx, %r13
; X64-NEXT: adcq $0, %r14
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq 48(%r13), %r11
-; X64-NEXT: movq %r12, %rdi
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r12, %rbx
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq 56(%r13), %rsi
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: addq %r13, %rax
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r10
+; X64-NEXT: setb %r15b
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r10, %r14
+; X64-NEXT: movzbl %r15b, %eax
+; X64-NEXT: adcq %rax, %rbx
+; X64-NEXT: addq %r8, %r14
+; X64-NEXT: adcq %rsi, %rbx
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: movq 48(%rcx), %rcx
+; X64-NEXT: movq %r12, %r15
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rbx, %r13
-; X64-NEXT: adcq %rbp, %rdi
-; X64-NEXT: setb %bl
+; X64-NEXT: addq %rsi, %r13
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq 56(%r8), %rsi
; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r13, %rdi
+; X64-NEXT: adcq %r10, %r15
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rdi, %r12
-; X64-NEXT: movzbl %bl, %eax
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r15, %r13
+; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: adcq %rax, %rsi
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r8, %r13
-; X64-NEXT: adcq $0, %r12
+; X64-NEXT: addq %r14, %rbp
+; X64-NEXT: movq %rbp, %r8
+; X64-NEXT: adcq %rbx, %rdi
+; X64-NEXT: adcq $0, %r13
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: addq %r14, %r12
-; X64-NEXT: adcq %r10, %rsi
-; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
+; X64-NEXT: addq %r11, %r13
+; X64-NEXT: adcq %r9, %rsi
+; X64-NEXT: setb %bpl
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r9, %rbp
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rbp, %r9
-; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: addq %r12, %r8
-; X64-NEXT: adcq %rsi, %r9
-; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r9, %rbx
; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rbx, %r9
+; X64-NEXT: adcq %r10, %r15
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r15, %rbx
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: addq %r13, %r12
+; X64-NEXT: adcq %rsi, %r9
+; X64-NEXT: movzbl %bpl, %eax
+; X64-NEXT: adcq %rax, %rbx
+; X64-NEXT: adcq $0, %r14
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r8
+; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r12
; X64-NEXT: adcq $0, %r9
+; X64-NEXT: adcq $0, %rbx
; X64-NEXT: adcq $0, %r14
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r8
+; X64-NEXT: movq %rsi, %r13
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rcx, %r10
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rbx, %r12
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r9
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rcx, %r13
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %r11
+; X64-NEXT: addq %r10, %r12
+; X64-NEXT: adcq %rsi, %r15
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r15, %rsi
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: adcq %rax, %r9
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r10, %r13
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: adcq %r15, %r10
+; X64-NEXT: setb %r8b
; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq %rbp, %r8
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %rbp
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rbp, %rcx
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: adcq %r12, %r15
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %r10, %rbp
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: adcq %rax, %r13
+; X64-NEXT: addq %rdi, %rbp
+; X64-NEXT: adcq %r12, %r13
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %r11, %r8
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r15, %r12
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %rdi, %r15
+; X64-NEXT: adcq $0, %r10
; X64-NEXT: movq %r8, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r9
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rsi, %r8
-; X64-NEXT: adcq %rbp, %rbx
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: addq %r15, %rax
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: adcq %r10, %r8
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rbx, %rbp
-; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: addq %r8, %r12
+; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: addq %rcx, %r12
-; X64-NEXT: adcq %r15, %r8
-; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: addq %rbp, %r11
+; X64-NEXT: adcq %r13, %r15
+; X64-NEXT: movq %r15, %rbp
+; X64-NEXT: adcq $0, %r12
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: addq %r13, %rbp
-; X64-NEXT: adcq %r11, %rdi
+; X64-NEXT: addq %rsi, %r12
+; X64-NEXT: adcq %r9, %rdi
; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %rcx, %rsi
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq %r13, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r15, %rbx
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rcx, %r8
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: adcq %rsi, %r11
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: addq %r11, %rax
-; X64-NEXT: movzbl %bl, %ecx
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %r8, %rax
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movzbl %sil, %ecx
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %rbp, %r13
-; X64-NEXT: adcq %rdi, %r15
+; X64-NEXT: addq %r12, %r10
+; X64-NEXT: adcq %rdi, %r8
; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; X64-NEXT: adcq %rcx, %rax
; X64-NEXT: movq %rax, %rcx
@@ -5369,555 +5374,559 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: adcq %r14, %r12
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r10, %r8
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %rbx, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r14, %rbp
+; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT: adcq %rax, %r13
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %rax, %r10
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rcx
; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq 64(%rsi), %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: movq 64(%r9), %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rcx, %r15
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rsi, %r8
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq 72(%rsi), %rcx
-; X64-NEXT: movq %rsi, %r13
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rbp, %r14
-; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq 72(%r9), %rsi
+; X64-NEXT: movq %r9, %rcx
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r12
; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rcx, %r15
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq %r8, %rbx
+; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %r10, %r9
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: adcq %rax, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %r11, %rdi
-; X64-NEXT: adcq %r14, %r12
-; X64-NEXT: adcq $0, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r8, %r14
; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq 80(%r13), %r11
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq %rbp, %rsi
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq 88(%r13), %r13
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r12, %rdi
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: setb %r9b
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r10, %r8
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r15, %r12
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: movzbl %r9b, %eax
-; X64-NEXT: adcq %rax, %rbx
-; X64-NEXT: addq %rdi, %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r12, %rbp
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: addq %r15, %rsi
-; X64-NEXT: adcq %r10, %rbx
-; X64-NEXT: setb %r9b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r14, %rbp
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbp, %rax
; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, %r14
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rcx, %r12
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %rsi, %r10
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %rbp
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: addq %r8, %rbp
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: addq %r11, %rbp
+; X64-NEXT: adcq %rbx, %r14
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: movq 80(%rcx), %r15
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %r8, %r11
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq 88(%rbx), %rbx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: addq %r11, %rax
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: adcq %r10, %r8
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r8, %r13
+; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq (%rsp), %rax # 8-byte Reload
-; X64-NEXT: imulq %rax, %r13
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %r13, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: imulq %rdi, %r11
-; X64-NEXT: addq %rdx, %r11
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: imulq %rbx, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: imulq %rbp, %r8
-; X64-NEXT: addq %rdx, %r8
-; X64-NEXT: addq %r10, %r9
-; X64-NEXT: adcq %r11, %r8
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: addq %rbp, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: adcq $0, %r12
+; X64-NEXT: addq %r9, %r13
+; X64-NEXT: adcq %rsi, %r12
+; X64-NEXT: setb %bpl
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, %r11
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rbx, %r10
-; X64-NEXT: adcq %rsi, %rbp
-; X64-NEXT: setb %cl
+; X64-NEXT: addq %rdi, %r10
+; X64-NEXT: adcq $0, %r8
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rbp, %r11
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r13
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: adcq %r8, %r13
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq 120(%rdx), %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: movq 112(%rdx), %rsi
+; X64-NEXT: movq %r11, %r14
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: imulq %r8, %rsi
-; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: movq 96(%rdi), %rcx
-; X64-NEXT: movq 104(%rdi), %rbp
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: imulq %rbp, %rdi
-; X64-NEXT: mulq %rcx
+; X64-NEXT: addq %r10, %rax
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: imulq %rcx, %r14
-; X64-NEXT: addq %rdx, %r14
-; X64-NEXT: addq %r15, %r9
-; X64-NEXT: adcq %rsi, %r14
-; X64-NEXT: movq %r14, %r15
+; X64-NEXT: adcq %r8, %rdi
+; X64-NEXT: setb %r8b
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rdi, %rsi
-; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movzbl %bl, %ecx
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: movzbl %r8b, %ecx
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: adcq %r15, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: adcq %r10, %rsi
-; X64-NEXT: adcq %r11, %rax
-; X64-NEXT: adcq %r13, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: addq %r13, %rsi
; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r12, %rax
+; X64-NEXT: adcq %r12, %r9
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movzbl %bpl, %ecx
+; X64-NEXT: adcq %rcx, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq 80(%r9), %rbp
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 88(%r9), %rbx
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: imulq %rax, %rbx
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rbx, %rdx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: imulq %rcx, %r15
+; X64-NEXT: addq %rdx, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rdi, %r10
-; X64-NEXT: adcq %rsi, %rbp
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: imulq %rsi, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r10, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: imulq %r9, %rbx
+; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: addq %r8, %rdi
+; X64-NEXT: adcq %r15, %rbx
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r8, %r15
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %rbx
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rbp, %r11
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %r8
-; X64-NEXT: movq 64(%r9), %r13
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq %r14, %rdi
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq 72(%r9), %rax
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r14, %rbp
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq %r15, %r11
+; X64-NEXT: adcq %r10, %r8
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rsi, %r12
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r14
-; X64-NEXT: addq %r15, %r12
-; X64-NEXT: adcq %r10, %r14
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r8, %r15
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %rsi
+; X64-NEXT: addq %rdi, %r15
+; X64-NEXT: adcq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq 120(%rdx), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: imulq %rax, %rdi
+; X64-NEXT: movq 112(%rdx), %rbx
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rdi, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: imulq %rcx, %rbx
+; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: movq 96(%r12), %r10
+; X64-NEXT: movq 104(%r12), %rdi
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r14, %r12
+; X64-NEXT: imulq %rdi, %r12
+; X64-NEXT: mulq %r10
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r12, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: imulq %r10, %r14
+; X64-NEXT: addq %rdx, %r14
+; X64-NEXT: addq %r8, %r13
+; X64-NEXT: adcq %rbx, %r14
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r9, %r15
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: addq %r8, %r12
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r12, %rbx
+; X64-NEXT: adcq %rbp, %r10
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: addq %r10, %rax
+; X64-NEXT: movzbl %r8b, %edi
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: addq %r13, %rax
+; X64-NEXT: adcq %r14, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: adcq %r11, %rbx
+; X64-NEXT: adcq %r15, %rax
+; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rbx
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq 80(%r13), %r8
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq 88(%r13), %r11
+; X64-NEXT: movq %r13, %r10
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %r9
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rbp, %rcx
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %rcx, %rdi
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rdi, %r14
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: setb %dil
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %rcx, %rsi
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: adcq %rax, %r13
+; X64-NEXT: movq %r10, %rdi
+; X64-NEXT: movq 64(%r10), %r10
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: adcq %rsi, %rdi
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq 72(%rdi), %rax
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rdi, %r13
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %r12, %r10
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r14, %rbp
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: addq %r11, %r13
-; X64-NEXT: adcq %r8, %r15
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, %r12
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: addq %rcx, %r12
+; X64-NEXT: adcq $0, %r15
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %r11, %rcx
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %r11, %r9
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %r12, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r15, %rcx
+; X64-NEXT: setb %dil
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: adcq %rsi, %r14
-; X64-NEXT: setb %sil
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rcx, %rbp
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: adcq %rax, %r12
+; X64-NEXT: addq %rbx, %rbp
+; X64-NEXT: adcq %r14, %r12
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %r10, %rdi
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r11, %rbx
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rcx, %r14
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %r14, %rax
-; X64-NEXT: movzbl %sil, %esi
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: addq %r13, %r12
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: adcq %r10, %rcx
+; X64-NEXT: setb %dil
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rcx, %r14
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: adcq %rax, %r10
+; X64-NEXT: addq %rbp, %r9
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r12, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: addq %rsi, %r14
+; X64-NEXT: adcq %r13, %r10
+; X64-NEXT: setb %dil
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rcx, %r9
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: addq %r14, %r12
; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %r8b, %ecx
+; X64-NEXT: adcq %r10, %r9
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movzbl %dil, %ecx
; X64-NEXT: adcq %rcx, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq 96(%rbp), %rsi
-; X64-NEXT: imulq %rsi, %r9
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq 96(%rdi), %rsi
+; X64-NEXT: imulq %rsi, %r15
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r9, %rdx
-; X64-NEXT: movq 104(%rbp), %r15
-; X64-NEXT: imulq %r15, %rbx
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: movq %rbx, %r9
-; X64-NEXT: movq 112(%rbp), %rax
-; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %r15, %rdx
+; X64-NEXT: movq 104(%rdi), %r9
+; X64-NEXT: imulq %r9, %rcx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq 112(%rdi), %rax
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: imulq %rbx, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq 120(%rdi), %r8
-; X64-NEXT: imulq %rbp, %r8
-; X64-NEXT: addq %rdx, %r8
-; X64-NEXT: addq %r14, %r13
-; X64-NEXT: adcq %r9, %r8
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq 120(%rdi), %rdi
+; X64-NEXT: imulq %r15, %rdi
+; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: addq %r10, %r8
+; X64-NEXT: adcq %r14, %rdi
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %r14, %rcx
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rsi, %rdi
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rdi, %r15
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %r13, %r15
-; X64-NEXT: adcq %r8, %r12
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: imulq %r11, %rcx
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r10, %r13
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %r13, %rbp
+; X64-NEXT: adcq %r14, %rcx
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rcx, %r14
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: adcq %rax, %r10
+; X64-NEXT: addq %r8, %r14
+; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: imulq %r15, %rdi
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: imulq %r14, %rsi
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %rdi, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: imulq %r12, %rsi
; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: movq %rsi, %r9
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: imulq %rbp, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rsi, %r8
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r11, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: imulq %r9, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: imulq %rdi, %r10
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: addq %r8, %r13
-; X64-NEXT: adcq %r9, %r10
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r10
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: imulq %r11, %rbx
+; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: addq %rcx, %r13
+; X64-NEXT: adcq %r8, %rbx
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rdi, %rsi
-; X64-NEXT: adcq %r9, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movzbl %dil, %ecx
-; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: addq %r8, %rsi
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rsi, %r11
+; X64-NEXT: adcq %rdi, %r8
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: addq %r8, %rax
+; X64-NEXT: movzbl %sil, %esi
+; X64-NEXT: adcq %rsi, %rdx
; X64-NEXT: addq %r13, %rax
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq %r15, %rax
-; X64-NEXT: adcq %r12, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: adcq %rbx, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq %rbp, %r11
+; X64-NEXT: adcq %r14, %rax
+; X64-NEXT: adcq %r10, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq %rcx, %r9
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: movq %rsi, %r8
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movq %rdi, %r9
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, (%rcx)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 8(%rcx)
+; X64-NEXT: movq %rdi, (%rsi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 16(%rcx)
+; X64-NEXT: movq %rdi, 8(%rsi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 24(%rcx)
+; X64-NEXT: movq %rdi, 16(%rsi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 32(%rcx)
+; X64-NEXT: movq %rdi, 24(%rsi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 40(%rcx)
+; X64-NEXT: movq %rdi, 32(%rsi)
+; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 40(%rsi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 48(%rcx)
+; X64-NEXT: movq %rdi, 48(%rsi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 56(%rcx)
-; X64-NEXT: movq %r9, 64(%rcx)
-; X64-NEXT: movq %r10, 72(%rcx)
-; X64-NEXT: movq %rbx, 80(%rcx)
-; X64-NEXT: movq %rbp, 88(%rcx)
-; X64-NEXT: movq %r8, 96(%rcx)
-; X64-NEXT: movq %rsi, 104(%rcx)
-; X64-NEXT: movq %rax, 112(%rcx)
-; X64-NEXT: movq %rdx, 120(%rcx)
+; X64-NEXT: movq %rdi, 56(%rsi)
+; X64-NEXT: movq %r8, 64(%rsi)
+; X64-NEXT: movq %r9, 72(%rsi)
+; X64-NEXT: movq %r10, 80(%rsi)
+; X64-NEXT: movq %rbx, 88(%rsi)
+; X64-NEXT: movq %rcx, 96(%rsi)
+; X64-NEXT: movq %r11, 104(%rsi)
+; X64-NEXT: movq %rax, 112(%rsi)
+; X64-NEXT: movq %rdx, 120(%rsi)
; X64-NEXT: addq $240, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index f5d83d090de48..9382278ff5c2d 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -312,71 +312,66 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: pushq %r14
; X64-NEXT: .cfi_def_cfa_offset 24
-; X64-NEXT: pushq %r12
-; X64-NEXT: .cfi_def_cfa_offset 32
; X64-NEXT: pushq %rbx
-; X64-NEXT: .cfi_def_cfa_offset 40
-; X64-NEXT: .cfi_offset %rbx, -40
-; X64-NEXT: .cfi_offset %r12, -32
+; X64-NEXT: .cfi_def_cfa_offset 32
+; X64-NEXT: .cfi_offset %rbx, -32
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq (%rdi), %r14
-; X64-NEXT: movq 8(%rdi), %r8
-; X64-NEXT: movq 16(%rdi), %rcx
-; X64-NEXT: movq 16(%rsi), %rbx
-; X64-NEXT: movq (%rsi), %r12
-; X64-NEXT: movq 8(%rsi), %r15
-; X64-NEXT: movq 24(%rdi), %rdi
-; X64-NEXT: imulq %r12, %rdi
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: imulq %r15, %rcx
-; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: movq %rbx, %rdi
-; X64-NEXT: imulq %r8, %rdi
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq (%rdi), %rbx
+; X64-NEXT: movq 8(%rdi), %r11
+; X64-NEXT: movq 16(%rdi), %r10
+; X64-NEXT: movq 16(%rsi), %r8
+; X64-NEXT: movq (%rsi), %r9
+; X64-NEXT: movq 8(%rsi), %r14
+; X64-NEXT: movq 24(%rdi), %r15
+; X64-NEXT: imulq %r9, %r15
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r15, %rdx
+; X64-NEXT: imulq %r14, %r10
+; X64-NEXT: addq %rdx, %r10
+; X64-NEXT: movq %r8, %r15
+; X64-NEXT: imulq %r11, %r15
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %r15, %rdx
+; X64-NEXT: movq 24(%rsi), %r15
+; X64-NEXT: imulq %rbx, %r15
+; X64-NEXT: addq %rdx, %r15
+; X64-NEXT: addq %rdi, %r8
+; X64-NEXT: adcq %r10, %r15
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq 24(%rsi), %rbx
-; X64-NEXT: imulq %r14, %rbx
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: addq %r10, %r11
-; X64-NEXT: adcq %rcx, %rbx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rsi, %r9
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rdi, %r14
-; X64-NEXT: adcq %rcx, %rsi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: adcq %rdi, %rsi
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %ecx
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: movzbl %al, %edi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r14
; X64-NEXT: addq %rsi, %rax
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %r11, %rax
-; X64-NEXT: adcq %rbx, %rdx
-; X64-NEXT: movq %r10, (%r9)
-; X64-NEXT: movq %r14, 8(%r9)
-; X64-NEXT: movq %rax, 16(%r9)
-; X64-NEXT: movq %rdx, 24(%r9)
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: addq %r8, %rax
+; X64-NEXT: adcq %r15, %rdx
+; X64-NEXT: movq %r10, (%rcx)
+; X64-NEXT: movq %rbx, 8(%rcx)
+; X64-NEXT: movq %rax, 16(%rcx)
+; X64-NEXT: movq %rdx, 24(%rcx)
; X64-NEXT: popq %rbx
-; X64-NEXT: .cfi_def_cfa_offset 32
-; X64-NEXT: popq %r12
; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: popq %r14
; X64-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 5781e339929c3..08d0f7cd08220 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -1179,267 +1179,271 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: pushq %rax
+; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq (%rdi), %r9
-; X64-NEXT: movq 8(%rdi), %r8
-; X64-NEXT: movq 24(%rdi), %r15
-; X64-NEXT: movq 16(%rdi), %rax
-; X64-NEXT: movq (%rsi), %rdi
-; X64-NEXT: movq 8(%rsi), %r14
-; X64-NEXT: movq %rsi, %r12
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: movq (%rdi), %rbx
+; X64-NEXT: movq 8(%rdi), %r9
+; X64-NEXT: movq 24(%rdi), %r12
+; X64-NEXT: movq 16(%rdi), %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq (%rsi), %rcx
+; X64-NEXT: movq 8(%rsi), %r11
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %rcx, %rsi
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rbx, %r10
-; X64-NEXT: adcq %rbp, %rcx
+; X64-NEXT: addq %rcx, %r10
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r10, %rcx
+; X64-NEXT: adcq %r8, %r14
; X64-NEXT: setb %al
; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %r14, %r15
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %r14, %r10
+; X64-NEXT: adcq %rsi, %r13
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %rbx
; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r9, %rdi
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbp, %rax
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r14, %r15
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: addq %r15, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %rcx
+; X64-NEXT: adcq %rbp, %rbx
; X64-NEXT: setb %sil
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rbx, %rbp
; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %r11, %rbx
-; X64-NEXT: adcq %r10, %r15
-; X64-NEXT: adcq $0, %r14
+; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: adcq %rcx, %r14
+; X64-NEXT: adcq $0, %r10
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 16(%r12), %r9
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rdi, %rsi
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, %r11
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq 16(%rdi), %r8
+; X64-NEXT: movq %r12, %r11
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r10, %rbp
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq 24(%r12), %r8
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r9, %r12
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: adcq %rcx, %rsi
-; X64-NEXT: setb %cl
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %rcx, %r15
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq 24(%rsi), %rsi
; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %r15, %r11
+; X64-NEXT: adcq %rbx, %r9
+; X64-NEXT: setb %bl
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: movzbl %bl, %eax
+; X64-NEXT: adcq %rax, %r15
+; X64-NEXT: addq %rbp, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: addq %r10, %rcx
+; X64-NEXT: adcq %r13, %r15
+; X64-NEXT: setb %r12b
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rsi, %r12
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r11
-; X64-NEXT: addq %rbx, %r13
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %rbp
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: addq %r14, %r12
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT: setb %r15b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: setb %bl
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: adcq %r9, %rbp
+; X64-NEXT: setb %dil
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %r12, %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r11, %rbp
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %r15b, %eax
-; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: mulq %rsi
+; X64-NEXT: addq %rbp, %rax
+; X64-NEXT: movzbl %dil, %edi
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: addq %rcx, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r15, %rbx
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movzbl %r12b, %ecx
+; X64-NEXT: adcq %rcx, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq 32(%rcx), %r12
-; X64-NEXT: imulq %r12, %r8
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r8, %rdx
-; X64-NEXT: movq 40(%rcx), %r8
-; X64-NEXT: imulq %r8, %r9
-; X64-NEXT: addq %rdx, %r9
+; X64-NEXT: movq 32(%rcx), %r15
+; X64-NEXT: imulq %r15, %rsi
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: movq 40(%rcx), %rsi
+; X64-NEXT: imulq %rsi, %r8
+; X64-NEXT: addq %rdx, %r8
; X64-NEXT: movq 48(%rcx), %rax
-; X64-NEXT: movq %rcx, %rbp
+; X64-NEXT: movq %rcx, %r11
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT: imulq %r15, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: imulq %r14, %rdi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq 56(%rbp), %rbp
-; X64-NEXT: imulq %rbx, %rbp
-; X64-NEXT: addq %rdx, %rbp
-; X64-NEXT: addq %r11, %rcx
-; X64-NEXT: adcq %r9, %rbp
+; X64-NEXT: movq 56(%r11), %r11
+; X64-NEXT: imulq %rbx, %r11
+; X64-NEXT: addq %rdx, %r11
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: adcq %r8, %r11
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, %r11
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rbx, %r8
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r9, %rbx
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rbx, %r12
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rsi, %r13
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %rcx, %r13
-; X64-NEXT: adcq %rbp, %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq 56(%rdx), %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: movq 48(%rdx), %rbx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: imulq %r9, %rbx
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: movq 32(%r8), %rdi
-; X64-NEXT: movq 40(%r8), %r8
+; X64-NEXT: addq %rbx, %r13
+; X64-NEXT: adcq %r9, %r15
+; X64-NEXT: setb %dil
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %r15, %r8
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: adcq %rax, %r12
+; X64-NEXT: addq %rcx, %r8
+; X64-NEXT: adcq %r11, %r12
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq 56(%rcx), %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: imulq %rax, %rsi
+; X64-NEXT: movq 48(%rcx), %r11
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: imulq %r8, %rcx
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: imulq %rdi, %r10
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: addq %rsi, %r11
-; X64-NEXT: adcq %rbx, %r10
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: imulq %r14, %r11
+; X64-NEXT: addq %rdx, %r11
+; X64-NEXT: movq 32(%rdi), %r9
+; X64-NEXT: movq 40(%rdi), %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rbx, %rbp
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: imulq %r15, %rsi
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rbp, %rdi
-; X64-NEXT: adcq %rcx, %rbx
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %r11, %rax
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: imulq %r9, %r10
+; X64-NEXT: addq %rdx, %r10
+; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: adcq %r11, %r10
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rsi, %rbx
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rbx, %r9
+; X64-NEXT: adcq %rbp, %rsi
+; X64-NEXT: setb %bl
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r14
+; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movzbl %bl, %esi
+; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: addq %rdi, %rax
; X64-NEXT: adcq %r10, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq %r12, %rdi
-; X64-NEXT: adcq %r13, %rax
-; X64-NEXT: adcq %r15, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT: adcq %r14, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq %r13, %r9
+; X64-NEXT: adcq %r8, %rax
+; X64-NEXT: adcq %r12, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, (%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, 8(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, 16(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, 24(%rcx)
-; X64-NEXT: movq %rsi, 32(%rcx)
-; X64-NEXT: movq %rdi, 40(%rcx)
-; X64-NEXT: movq %rax, 48(%rcx)
-; X64-NEXT: movq %rdx, 56(%rcx)
+; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, (%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 8(%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 16(%rsi)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 24(%rsi)
+; X64-NEXT: movq %rcx, 32(%rsi)
+; X64-NEXT: movq %r9, 40(%rsi)
+; X64-NEXT: movq %rax, 48(%rsi)
+; X64-NEXT: movq %rdx, 56(%rsi)
+; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 87fc5bf76024c..9a6cf0b065662 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -7,64 +7,61 @@
define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp {
; CHECK-LABEL: x:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: pushq %r15
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset %rbx, -32
-; CHECK-NEXT: .cfi_offset %r14, -24
-; CHECK-NEXT: .cfi_offset %r15, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %r14, -16
; CHECK-NEXT: movq %rdx, %r11
-; CHECK-NEXT: movq %rsi, %r9
-; CHECK-NEXT: movq %rdi, %r15
-; CHECK-NEXT: sarq $63, %rsi
+; CHECK-NEXT: movq %rdi, %r9
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: sarq $63, %rbx
; CHECK-NEXT: movq %rdx, %rdi
-; CHECK-NEXT: imulq %rsi, %rdi
+; CHECK-NEXT: imulq %rbx, %rdi
; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: mulq %rsi
+; CHECK-NEXT: mulq %rbx
; CHECK-NEXT: movq %rax, %r8
; CHECK-NEXT: addq %rdi, %rdx
-; CHECK-NEXT: imulq %rcx, %rsi
-; CHECK-NEXT: addq %rdx, %rsi
+; CHECK-NEXT: imulq %rcx, %rbx
+; CHECK-NEXT: addq %rdx, %rbx
; CHECK-NEXT: movq %rcx, %rdi
; CHECK-NEXT: sarq $63, %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: imulq %r9, %rbx
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: imulq %rsi, %r14
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %r15
+; CHECK-NEXT: mulq %r9
; CHECK-NEXT: movq %rax, %r10
-; CHECK-NEXT: addq %rbx, %rdx
-; CHECK-NEXT: imulq %r15, %rdi
+; CHECK-NEXT: addq %r14, %rdx
+; CHECK-NEXT: imulq %r9, %rdi
; CHECK-NEXT: addq %rdx, %rdi
; CHECK-NEXT: addq %r8, %r10
-; CHECK-NEXT: adcq %rsi, %rdi
-; CHECK-NEXT: movq %r15, %rax
-; CHECK-NEXT: mulq %r11
-; CHECK-NEXT: movq %rdx, %r14
-; CHECK-NEXT: movq %rax, %r8
+; CHECK-NEXT: adcq %rbx, %rdi
; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: mulq %r11
; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: movq %rax, %rsi
-; CHECK-NEXT: addq %r14, %rsi
-; CHECK-NEXT: adcq $0, %rbx
-; CHECK-NEXT: movq %r15, %rax
+; CHECK-NEXT: movq %rax, %r8
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: mulq %r11
+; CHECK-NEXT: movq %rdx, %r11
+; CHECK-NEXT: movq %rax, %r14
+; CHECK-NEXT: addq %rbx, %r14
+; CHECK-NEXT: adcq $0, %r11
+; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: movq %rdx, %r14
-; CHECK-NEXT: movq %rax, %r11
-; CHECK-NEXT: addq %rsi, %r11
-; CHECK-NEXT: adcq %rbx, %r14
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rax, %r9
+; CHECK-NEXT: addq %r14, %r9
+; CHECK-NEXT: adcq %r11, %rbx
; CHECK-NEXT: setb %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: movzbl %al, %r11d
+; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: addq %r14, %rax
-; CHECK-NEXT: adcq %rsi, %rdx
+; CHECK-NEXT: addq %rbx, %rax
+; CHECK-NEXT: adcq %r11, %rdx
; CHECK-NEXT: addq %r10, %rax
; CHECK-NEXT: adcq %rdi, %rdx
-; CHECK-NEXT: movq %r11, %rcx
+; CHECK-NEXT: movq %r9, %rcx
; CHECK-NEXT: sarq $63, %rcx
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: xorq %rax, %rcx
@@ -72,10 +69,9 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %nooverflow
; CHECK-NEXT: movq %r8, %rax
-; CHECK-NEXT: movq %r11, %rdx
+; CHECK-NEXT: movq %r9, %rdx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
; CHECK-NEXT: retq
; CHECK-NEXT: LBB0_1: ## %overflow
; CHECK-NEXT: ud2
diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll
index e722ab33894e2..1756154272018 100644
--- a/llvm/test/CodeGen/X86/musttail-varargs.ll
+++ b/llvm/test/CodeGen/X86/musttail-varargs.ll
@@ -46,12 +46,12 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; LINUX-NEXT: movq %r9, %r15
-; LINUX-NEXT: movq %r8, %r12
-; LINUX-NEXT: movq %rcx, %r13
-; LINUX-NEXT: movq %rdx, %rbp
-; LINUX-NEXT: movq %rsi, %rbx
-; LINUX-NEXT: movq %rdi, %r14
+; LINUX-NEXT: movq %r9, %r14
+; LINUX-NEXT: movq %r8, %r15
+; LINUX-NEXT: movq %rcx, %r12
+; LINUX-NEXT: movq %rdx, %r13
+; LINUX-NEXT: movq %rsi, %rbp
+; LINUX-NEXT: movq %rdi, %rbx
; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
@@ -77,13 +77,13 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; LINUX-NEXT: callq get_f@PLT
; LINUX-NEXT: movq %rax, %r11
-; LINUX-NEXT: movq %r14, %rdi
-; LINUX-NEXT: movq %rbx, %rsi
-; LINUX-NEXT: movq %rbp, %rdx
-; LINUX-NEXT: movq %r13, %rcx
-; LINUX-NEXT: movq %r12, %r8
+; LINUX-NEXT: movq %rbx, %rdi
+; LINUX-NEXT: movq %rbp, %rsi
+; LINUX-NEXT: movq %r13, %rdx
+; LINUX-NEXT: movq %r12, %rcx
+; LINUX-NEXT: movq %r15, %r8
; LINUX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; LINUX-NEXT: movq %r15, %r9
+; LINUX-NEXT: movq %r14, %r9
; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -139,12 +139,12 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; LINUX-X32-NEXT: movq %r9, %r15
-; LINUX-X32-NEXT: movq %r8, %r12
-; LINUX-X32-NEXT: movq %rcx, %r13
-; LINUX-X32-NEXT: movq %rdx, %rbp
-; LINUX-X32-NEXT: movq %rsi, %rbx
-; LINUX-X32-NEXT: movq %rdi, %r14
+; LINUX-X32-NEXT: movq %r9, %r14
+; LINUX-X32-NEXT: movq %r8, %r15
+; LINUX-X32-NEXT: movq %rcx, %r12
+; LINUX-X32-NEXT: movq %rdx, %r13
+; LINUX-X32-NEXT: movq %rsi, %rbp
+; LINUX-X32-NEXT: movq %rdi, %rbx
; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp)
; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp)
; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp)
@@ -170,13 +170,13 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-X32-NEXT: movq %rax, {{[0-9]+}}(%esp)
; LINUX-X32-NEXT: callq get_f@PLT
; LINUX-X32-NEXT: movl %eax, %r11d
-; LINUX-X32-NEXT: movq %r14, %rdi
-; LINUX-X32-NEXT: movq %rbx, %rsi
-; LINUX-X32-NEXT: movq %rbp, %rdx
-; LINUX-X32-NEXT: movq %r13, %rcx
-; LINUX-X32-NEXT: movq %r12, %r8
+; LINUX-X32-NEXT: movq %rbx, %rdi
+; LINUX-X32-NEXT: movq %rbp, %rsi
+; LINUX-X32-NEXT: movq %r13, %rdx
+; LINUX-X32-NEXT: movq %r12, %rcx
+; LINUX-X32-NEXT: movq %r15, %r8
; LINUX-X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; LINUX-X32-NEXT: movq %r15, %r9
+; LINUX-X32-NEXT: movq %r14, %r9
; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -214,20 +214,20 @@ define void @f_thunk(ptr %this, ...) {
; WINDOWS-NEXT: subq $72, %rsp
; WINDOWS-NEXT: .seh_stackalloc 72
; WINDOWS-NEXT: .seh_endprologue
-; WINDOWS-NEXT: movq %r9, %r14
+; WINDOWS-NEXT: movq %r9, %rsi
; WINDOWS-NEXT: movq %r8, %rdi
; WINDOWS-NEXT: movq %rdx, %rbx
-; WINDOWS-NEXT: movq %rcx, %rsi
+; WINDOWS-NEXT: movq %rcx, %r14
; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp)
; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp)
; WINDOWS-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; WINDOWS-NEXT: callq get_f
-; WINDOWS-NEXT: movq %rsi, %rcx
+; WINDOWS-NEXT: movq %r14, %rcx
; WINDOWS-NEXT: movq %rbx, %rdx
; WINDOWS-NEXT: movq %rdi, %r8
-; WINDOWS-NEXT: movq %r14, %r9
+; WINDOWS-NEXT: movq %rsi, %r9
; WINDOWS-NEXT: addq $72, %rsp
; WINDOWS-NEXT: popq %rbx
; WINDOWS-NEXT: popq %rdi
diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll
index 4a0ba50fb4185..98d193a79cb74 100644
--- a/llvm/test/CodeGen/X86/nontemporal-loads.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll
@@ -1781,29 +1781,26 @@ define <16 x i32> @test_masked_v16i32(ptr %addr, <16 x i32> %old, <16 x i32> %ma
;
; SSE41-LABEL: test_masked_v16i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm7, %xmm9
-; SSE41-NEXT: movdqa %xmm6, %xmm10
-; SSE41-NEXT: movdqa %xmm5, %xmm11
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
; SSE41-NEXT: movntdqa 48(%rdi), %xmm4
-; SSE41-NEXT: movntdqa 32(%rdi), %xmm7
-; SSE41-NEXT: movntdqa 16(%rdi), %xmm6
-; SSE41-NEXT: movntdqa (%rdi), %xmm5
-; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm11, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm6
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm7
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm9
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm10
+; SSE41-NEXT: movntdqa (%rdi), %xmm11
+; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm11
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm10
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm9
+; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4
-; SSE41-NEXT: movaps %xmm5, %xmm0
-; SSE41-NEXT: movaps %xmm6, %xmm1
-; SSE41-NEXT: movaps %xmm7, %xmm2
+; SSE41-NEXT: movaps %xmm11, %xmm0
+; SSE41-NEXT: movaps %xmm10, %xmm1
+; SSE41-NEXT: movaps %xmm9, %xmm2
; SSE41-NEXT: movaps %xmm4, %xmm3
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 0e5539449b916..6ac734cad64c7 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -910,7 +910,7 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm3
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
-; SSE2-NEXT: movdqu 32(%rdi), %xmm8
+; SSE2-NEXT: movdqu 32(%rdi), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pand %xmm1, %xmm4
@@ -921,7 +921,7 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
@@ -938,12 +938,12 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm7, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm5, %xmm8
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm2, %xmm4
@@ -952,12 +952,12 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
-; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: por %xmm2, %xmm6
; SSE2-NEXT: movups %xmm1, (%rsi)
-; SSE2-NEXT: movdqu %xmm0, (%rdx)
+; SSE2-NEXT: movdqu %xmm8, (%rdx)
; SSE2-NEXT: movdqu %xmm6, (%rcx)
; SSE2-NEXT: retq
;
@@ -1057,7 +1057,7 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2-LABEL: interleave_24i16_out_reverse:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqu (%rdi), %xmm8
+; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu 16(%rdi), %xmm1
; SSE2-NEXT: movdqu 32(%rdi), %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0]
@@ -1069,7 +1069,7 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,5,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
@@ -1086,12 +1086,12 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm7, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm5, %xmm8
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
@@ -1101,13 +1101,13 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,3,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
-; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
+; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: por %xmm1, %xmm6
; SSE2-NEXT: movups %xmm2, (%rsi)
-; SSE2-NEXT: movdqu %xmm0, (%rdx)
+; SSE2-NEXT: movdqu %xmm8, (%rdx)
; SSE2-NEXT: movdqu %xmm6, (%rcx)
; SSE2-NEXT: retq
;
@@ -1393,75 +1393,75 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqu 64(%rdi), %xmm9
-; SSE2-NEXT: movups 80(%rdi), %xmm8
+; SSE2-NEXT: movdqu 64(%rdi), %xmm1
+; SSE2-NEXT: movups 80(%rdi), %xmm4
; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu 16(%rdi), %xmm10
+; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: movups 32(%rdi), %xmm5
; SSE2-NEXT: movdqu 48(%rdi), %xmm3
; SSE2-NEXT: movaps %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,1,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm10[2,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[2,0]
-; SSE2-NEXT: movaps %xmm8, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm9[2,0]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm8[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[2,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm5[2,0]
+; SSE2-NEXT: movaps %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[2,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm10
+; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm10[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm2[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0]
-; SSE2-NEXT: movups %xmm2, 16(%rsi)
-; SSE2-NEXT: movups %xmm4, (%rsi)
+; SSE2-NEXT: movups %xmm10, 16(%rsi)
+; SSE2-NEXT: movups %xmm8, (%rsi)
; SSE2-NEXT: movups %xmm3, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
-; SSE2-NEXT: movups %xmm1, 16(%rcx)
+; SSE2-NEXT: movups %xmm9, 16(%rcx)
; SSE2-NEXT: movups %xmm7, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
; SSE42: # %bb.0:
-; SSE42-NEXT: movups 80(%rdi), %xmm8
-; SSE42-NEXT: movdqu 64(%rdi), %xmm9
+; SSE42-NEXT: movups 80(%rdi), %xmm0
+; SSE42-NEXT: movdqu 64(%rdi), %xmm1
; SSE42-NEXT: movdqu (%rdi), %xmm3
; SSE42-NEXT: movdqu 16(%rdi), %xmm2
-; SSE42-NEXT: movups 32(%rdi), %xmm10
+; SSE42-NEXT: movups 32(%rdi), %xmm4
; SSE42-NEXT: movdqu 48(%rdi), %xmm5
; SSE42-NEXT: movdqa %xmm2, %xmm6
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[1]
-; SSE42-NEXT: movdqa %xmm9, %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2]
+; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[1]
+; SSE42-NEXT: movdqa %xmm1, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3]
+; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3]
+; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm4[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
-; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7]
-; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,3]
+; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,3]
+; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7]
+; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3]
; SSE42-NEXT: movups %xmm5, 16(%rsi)
; SSE42-NEXT: movups %xmm3, (%rsi)
-; SSE42-NEXT: movdqu %xmm4, 16(%rdx)
+; SSE42-NEXT: movdqu %xmm10, 16(%rdx)
; SSE42-NEXT: movdqu %xmm6, (%rdx)
-; SSE42-NEXT: movups %xmm0, 16(%rcx)
+; SSE42-NEXT: movups %xmm9, 16(%rcx)
; SSE42-NEXT: movups %xmm7, (%rcx)
; SSE42-NEXT: retq
;
@@ -1633,35 +1633,35 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2-NEXT: movups 16(%rsi), %xmm0
; SSE2-NEXT: movups (%rdx), %xmm2
; SSE2-NEXT: movups 16(%rdx), %xmm5
-; SSE2-NEXT: movups (%rcx), %xmm8
-; SSE2-NEXT: movups 16(%rcx), %xmm9
-; SSE2-NEXT: movaps %xmm8, %xmm7
+; SSE2-NEXT: movups (%rcx), %xmm4
+; SSE2-NEXT: movups 16(%rcx), %xmm6
+; SSE2-NEXT: movaps %xmm4, %xmm7
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[1,3]
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2]
; SSE2-NEXT: movaps %xmm0, %xmm7
; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1]
-; SSE2-NEXT: movaps %xmm9, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[1,3]
-; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: movaps %xmm6, %xmm8
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3]
+; SSE2-NEXT: movaps %xmm0, %xmm9
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm9[2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm5[1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm7[0,2]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm5[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[0,2]
; SSE2-NEXT: movaps %xmm1, %xmm5
; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm8[2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
-; SSE2-NEXT: movups %xmm8, 16(%rdi)
-; SSE2-NEXT: movups %xmm4, 48(%rdi)
-; SSE2-NEXT: movups %xmm9, 64(%rdi)
+; SSE2-NEXT: movups %xmm4, 16(%rdi)
+; SSE2-NEXT: movups %xmm9, 48(%rdi)
+; SSE2-NEXT: movups %xmm6, 64(%rdi)
; SSE2-NEXT: movups %xmm3, (%rdi)
; SSE2-NEXT: movups %xmm1, 32(%rdi)
; SSE2-NEXT: movups %xmm0, 80(%rdi)
@@ -1671,38 +1671,38 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE42: # %bb.0:
; SSE42-NEXT: movdqu (%rsi), %xmm0
; SSE42-NEXT: movdqu 16(%rsi), %xmm4
-; SSE42-NEXT: movdqu (%rdx), %xmm9
+; SSE42-NEXT: movdqu (%rdx), %xmm2
; SSE42-NEXT: movdqu 16(%rdx), %xmm5
; SSE42-NEXT: movdqu (%rcx), %xmm3
; SSE42-NEXT: movdqu 16(%rcx), %xmm6
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,1,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5],xmm9[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm0[4,5],xmm9[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm3[2,3],xmm9[4,5,6,7]
; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
-; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm9[3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3,4,5],xmm3[6,7]
-; SSE42-NEXT: movdqu %xmm3, 32(%rdi)
+; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
+; SSE42-NEXT: movdqu %xmm2, 32(%rdi)
; SSE42-NEXT: movdqu %xmm5, 80(%rdi)
-; SSE42-NEXT: movdqu %xmm2, 16(%rdi)
-; SSE42-NEXT: movdqu %xmm1, 48(%rdi)
+; SSE42-NEXT: movdqu %xmm9, 16(%rdi)
+; SSE42-NEXT: movdqu %xmm8, 48(%rdi)
; SSE42-NEXT: movdqu %xmm7, 64(%rdi)
-; SSE42-NEXT: movdqu %xmm8, (%rdi)
+; SSE42-NEXT: movdqu %xmm1, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i32_in:
@@ -2009,19 +2009,19 @@ define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, ptr%a2) {
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
-; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm2, %xmm2
-; XOP-NEXT: vpperm %xmm3, %xmm0, %xmm7, %xmm0
-; XOP-NEXT: vpperm %xmm3, %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpperm %xmm3, %xmm1, %xmm6, %xmm1
-; XOP-NEXT: vpperm %xmm3, %xmm5, %xmm8, %xmm7
-; XOP-NEXT: vpperm %xmm3, %xmm6, %xmm5, %xmm3
-; XOP-NEXT: vmovdqa %xmm3, 80(%rdi)
-; XOP-NEXT: vmovdqa %xmm7, 64(%rdi)
+; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
+; XOP-NEXT: vpperm %xmm8, %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpperm %xmm8, %xmm0, %xmm7, %xmm0
+; XOP-NEXT: vpperm %xmm8, %xmm7, %xmm4, %xmm4
+; XOP-NEXT: vpperm %xmm8, %xmm1, %xmm6, %xmm1
+; XOP-NEXT: vpperm %xmm8, %xmm5, %xmm3, %xmm3
+; XOP-NEXT: vpperm %xmm8, %xmm6, %xmm5, %xmm5
+; XOP-NEXT: vmovdqa %xmm5, 80(%rdi)
+; XOP-NEXT: vmovdqa %xmm3, 64(%rdi)
; XOP-NEXT: vmovdqa %xmm1, 48(%rdi)
; XOP-NEXT: vmovdqa %xmm4, 32(%rdi)
; XOP-NEXT: vmovdqa %xmm2, 16(%rdi)
@@ -2181,19 +2181,19 @@ define void @splat3_256(<32 x i8> %a0, ptr%a1) {
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
-; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpperm %xmm3, %xmm0, %xmm7, %xmm0
-; XOP-NEXT: vpperm %xmm3, %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpperm %xmm3, %xmm2, %xmm6, %xmm2
-; XOP-NEXT: vpperm %xmm3, %xmm5, %xmm8, %xmm7
-; XOP-NEXT: vpperm %xmm3, %xmm6, %xmm5, %xmm3
-; XOP-NEXT: vmovdqa %xmm3, 80(%rdi)
-; XOP-NEXT: vmovdqa %xmm7, 64(%rdi)
+; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
+; XOP-NEXT: vpperm %xmm8, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpperm %xmm8, %xmm0, %xmm7, %xmm0
+; XOP-NEXT: vpperm %xmm8, %xmm7, %xmm4, %xmm4
+; XOP-NEXT: vpperm %xmm8, %xmm2, %xmm6, %xmm2
+; XOP-NEXT: vpperm %xmm8, %xmm5, %xmm3, %xmm3
+; XOP-NEXT: vpperm %xmm8, %xmm6, %xmm5, %xmm5
+; XOP-NEXT: vmovdqa %xmm5, 80(%rdi)
+; XOP-NEXT: vmovdqa %xmm3, 64(%rdi)
; XOP-NEXT: vmovdqa %xmm2, 48(%rdi)
; XOP-NEXT: vmovdqa %xmm4, 32(%rdi)
; XOP-NEXT: vmovdqa %xmm1, 16(%rdi)
diff --git a/llvm/test/CodeGen/X86/or-address.ll b/llvm/test/CodeGen/X86/or-address.ll
index 0931f47e71226..be8578d3dd9b4 100644
--- a/llvm/test/CodeGen/X86/or-address.ll
+++ b/llvm/test/CodeGen/X86/or-address.ll
@@ -47,10 +47,10 @@ return: ; preds = %bb
}
; CHECK-LABEL: test1:
-; CHECK: movl %{{.*}}, (%[[RDI:...]],%[[RCX:...]],4)
-; CHECK: movl %{{.*}}, 8(%[[RDI]],%[[RCX]],4)
-; CHECK: movl %{{.*}}, 4(%[[RDI]],%[[RCX]],4)
-; CHECK: movl %{{.*}}, 12(%[[RDI]],%[[RCX]],4)
+; CHECK: movl %{{.*}}, (%[[BASE:r.*]],%[[INDEX:r.*]],4)
+; CHECK: movl %{{.*}}, 8(%[[BASE]],%[[INDEX]],4)
+; CHECK: movl %{{.*}}, 4(%[[BASE]],%[[INDEX]],4)
+; CHECK: movl %{{.*}}, 12(%[[BASE]],%[[INDEX]],4)
define void @test1(ptr nocapture %array, i32 %r0, i8 signext %k, i8 signext %i0) nounwind {
bb.nph:
diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll
index d480ea722fbdb..766c681cd364b 100644
--- a/llvm/test/CodeGen/X86/paddus.ll
+++ b/llvm/test/CodeGen/X86/paddus.ll
@@ -397,23 +397,23 @@ define <32 x i8> @test12(<32 x i8> %x) {
define <64 x i8> @test13(<64 x i8> %x) {
; SSE-LABEL: test13:
; SSE: # %bb.0:
-; SSE-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE-NEXT: pcmpeqd %xmm4, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psubb %xmm8, %xmm5
+; SSE-NEXT: psubb %xmm4, %xmm5
; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: psubb %xmm8, %xmm6
+; SSE-NEXT: psubb %xmm4, %xmm6
; SSE-NEXT: movdqa %xmm1, %xmm7
-; SSE-NEXT: psubb %xmm8, %xmm7
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: psubb %xmm8, %xmm4
-; SSE-NEXT: pcmpeqb %xmm8, %xmm3
+; SSE-NEXT: psubb %xmm4, %xmm7
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: psubb %xmm4, %xmm8
+; SSE-NEXT: pcmpeqb %xmm4, %xmm3
; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: pcmpeqb %xmm8, %xmm2
+; SSE-NEXT: pcmpeqb %xmm4, %xmm2
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: pcmpeqb %xmm8, %xmm1
+; SSE-NEXT: pcmpeqb %xmm4, %xmm1
; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: pcmpeqb %xmm8, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE-NEXT: por %xmm8, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test13:
@@ -1150,23 +1150,23 @@ define <16 x i16> @test30(<16 x i16> %x) {
define <32 x i16> @test31(<32 x i16> %x) {
; SSE-LABEL: test31:
; SSE: # %bb.0:
-; SSE-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE-NEXT: pcmpeqd %xmm4, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psubw %xmm8, %xmm5
+; SSE-NEXT: psubw %xmm4, %xmm5
; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: psubw %xmm8, %xmm6
+; SSE-NEXT: psubw %xmm4, %xmm6
; SSE-NEXT: movdqa %xmm1, %xmm7
-; SSE-NEXT: psubw %xmm8, %xmm7
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: psubw %xmm8, %xmm4
-; SSE-NEXT: pcmpeqw %xmm8, %xmm3
+; SSE-NEXT: psubw %xmm4, %xmm7
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: psubw %xmm4, %xmm8
+; SSE-NEXT: pcmpeqw %xmm4, %xmm3
; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: pcmpeqw %xmm8, %xmm2
+; SSE-NEXT: pcmpeqw %xmm4, %xmm2
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: pcmpeqw %xmm8, %xmm1
+; SSE-NEXT: pcmpeqw %xmm4, %xmm1
; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: pcmpeqw %xmm8, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE-NEXT: por %xmm8, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test31:
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 8dd6ce2b3057b..1b2dae5f2830a 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -870,17 +870,17 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; SSE2-NEXT: pmullw %xmm4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: packuswb %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm8, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm5, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -1237,67 +1237,67 @@ entry:
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm14
-; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm14, %xmm10
-; SSE2-NEXT: movdqa %xmm14, %xmm8
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm11
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm11
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
-; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm12
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; SSE2-NEXT: pcmpgtd %xmm15, %xmm13
-; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3]
-; SSE2-NEXT: pmuludq %xmm15, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,1,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm4
-; SSE2-NEXT: paddq %xmm6, %xmm4
-; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm15, %xmm0
-; SSE2-NEXT: paddq %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3]
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm13
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm12
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm12
+; SSE2-NEXT: paddq %xmm14, %xmm12
+; SSE2-NEXT: psllq $32, %xmm12
+; SSE2-NEXT: pmuludq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm12, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3]
-; SSE2-NEXT: pmuludq %xmm11, %xmm5
-; SSE2-NEXT: paddq %xmm4, %xmm5
-; SSE2-NEXT: psllq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm11, %xmm1
-; SSE2-NEXT: paddq %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm9, %xmm11
+; SSE2-NEXT: paddq %xmm4, %xmm11
+; SSE2-NEXT: psllq $32, %xmm11
+; SSE2-NEXT: pmuludq %xmm9, %xmm1
+; SSE2-NEXT: paddq %xmm11, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
-; SSE2-NEXT: pmuludq %xmm14, %xmm5
-; SSE2-NEXT: paddq %xmm4, %xmm5
-; SSE2-NEXT: psllq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm14, %xmm2
-; SSE2-NEXT: paddq %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm9
+; SSE2-NEXT: paddq %xmm4, %xmm9
+; SSE2-NEXT: psllq $32, %xmm9
+; SSE2-NEXT: pmuludq %xmm6, %xmm2
+; SSE2-NEXT: paddq %xmm9, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm5
-; SSE2-NEXT: paddq %xmm4, %xmm5
-; SSE2-NEXT: psllq $32, %xmm5
-; SSE2-NEXT: pmuludq %xmm8, %xmm3
-; SSE2-NEXT: paddq %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm6
+; SSE2-NEXT: paddq %xmm4, %xmm6
+; SSE2-NEXT: psllq $32, %xmm6
+; SSE2-NEXT: pmuludq %xmm5, %xmm3
+; SSE2-NEXT: paddq %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_sext:
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 442d853cf5e36..1d7aa7f2586fe 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -318,44 +318,42 @@ define <16 x i16> @zext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: and_mulhuw_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pand %xmm6, %xmm8
-; SSE2-NEXT: packssdw %xmm7, %xmm8
-; SSE2-NEXT: pmulhw %xmm2, %xmm8
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: packssdw %xmm5, %xmm6
-; SSE2-NEXT: pmulhw %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: packssdw %xmm7, %xmm6
+; SSE2-NEXT: pmulhw %xmm2, %xmm6
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm8
+; SSE2-NEXT: packssdw %xmm5, %xmm8
+; SSE2-NEXT: pmulhw %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: and_mulhuw_v16i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767]
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
+; SSE41-NEXT: pand %xmm8, %xmm3
+; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm6, %xmm1
-; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm1
+; SSE41-NEXT: pand %xmm8, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: pand %xmm6, %xmm7
-; SSE41-NEXT: pand %xmm6, %xmm8
-; SSE41-NEXT: packusdw %xmm7, %xmm8
-; SSE41-NEXT: pmulhw %xmm2, %xmm8
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pand %xmm4, %xmm6
-; SSE41-NEXT: packusdw %xmm5, %xmm6
-; SSE41-NEXT: pmulhw %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm1
+; SSE41-NEXT: pand %xmm8, %xmm7
+; SSE41-NEXT: pand %xmm8, %xmm6
+; SSE41-NEXT: packusdw %xmm7, %xmm6
+; SSE41-NEXT: pmulhw %xmm2, %xmm6
+; SSE41-NEXT: pand %xmm8, %xmm5
+; SSE41-NEXT: pand %xmm4, %xmm8
+; SSE41-NEXT: packusdw %xmm5, %xmm8
+; SSE41-NEXT: pmulhw %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: and_mulhuw_v16i16:
@@ -1084,18 +1082,18 @@ define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhuw %xmm4, %xmm0
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE41-NEXT: pmulhuw %xmm5, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE41-NEXT: pmulhuw %xmm6, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE41-NEXT: pmulhuw %xmm7, %xmm3
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
; SSE41-NEXT: movdqa %xmm3, 112(%rdi)
; SSE41-NEXT: movdqa %xmm7, 96(%rdi)
; SSE41-NEXT: movdqa %xmm2, 80(%rdi)
@@ -1103,7 +1101,7 @@ define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
; SSE41-NEXT: movdqa %xmm1, 48(%rdi)
; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm8, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX2-LABEL: zext_mulhuw_v32i16_lshr:
@@ -1179,18 +1177,18 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmulhw %xmm4, %xmm0
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE41-NEXT: pmulhw %xmm5, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE41-NEXT: pmulhw %xmm6, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE41-NEXT: pmulhw %xmm7, %xmm3
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
; SSE41-NEXT: movdqa %xmm3, 112(%rdi)
; SSE41-NEXT: movdqa %xmm7, 96(%rdi)
; SSE41-NEXT: movdqa %xmm2, 80(%rdi)
@@ -1198,7 +1196,7 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
; SSE41-NEXT: movdqa %xmm1, 48(%rdi)
; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm8, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX2-LABEL: mulhsw_v32i16_lshr:
@@ -1709,58 +1707,58 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; SSE2-NEXT: psrad $16, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
-; SSE2-NEXT: psrad $16, %xmm11
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
-; SSE2-NEXT: psrad $16, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
-; SSE2-NEXT: psrad $16, %xmm13
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
+; SSE2-NEXT: psrad $16, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3]
-; SSE2-NEXT: psrad $16, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7]
-; SSE2-NEXT: psrad $16, %xmm15
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; SSE2-NEXT: psrad $16, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; SSE2-NEXT: psrad $16, %xmm12
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; SSE2-NEXT: psrad $16, %xmm13
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
+; SSE2-NEXT: psrad $16, %xmm14
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3]
+; SSE2-NEXT: psrad $16, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm7
; SSE2-NEXT: movdqa %xmm7, 240(%rdi)
-; SSE2-NEXT: movdqa %xmm3, 224(%rdi)
+; SSE2-NEXT: movdqa %xmm15, 224(%rdi)
; SSE2-NEXT: movdqa %xmm6, 208(%rdi)
-; SSE2-NEXT: movdqa %xmm1, 192(%rdi)
+; SSE2-NEXT: movdqa %xmm14, 192(%rdi)
; SSE2-NEXT: movdqa %xmm5, 176(%rdi)
-; SSE2-NEXT: movdqa %xmm2, 160(%rdi)
+; SSE2-NEXT: movdqa %xmm13, 160(%rdi)
; SSE2-NEXT: movdqa %xmm4, 144(%rdi)
-; SSE2-NEXT: movdqa %xmm0, 128(%rdi)
-; SSE2-NEXT: movdqa %xmm15, 112(%rdi)
-; SSE2-NEXT: movdqa %xmm14, 96(%rdi)
-; SSE2-NEXT: movdqa %xmm13, 80(%rdi)
-; SSE2-NEXT: movdqa %xmm12, 64(%rdi)
-; SSE2-NEXT: movdqa %xmm11, 48(%rdi)
-; SSE2-NEXT: movdqa %xmm10, 32(%rdi)
-; SSE2-NEXT: movdqa %xmm9, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm12, 128(%rdi)
+; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
+; SSE2-NEXT: movdqa %xmm11, 96(%rdi)
+; SSE2-NEXT: movdqa %xmm2, 80(%rdi)
+; SSE2-NEXT: movdqa %xmm10, 64(%rdi)
+; SSE2-NEXT: movdqa %xmm1, 48(%rdi)
+; SSE2-NEXT: movdqa %xmm9, 32(%rdi)
+; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
; SSE2-NEXT: movdqa %xmm8, (%rdi)
; SSE2-NEXT: retq
;
@@ -1770,50 +1768,50 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: pmovsxwd %xmm0, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm9
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm11
+; SSE41-NEXT: pmovsxwd %xmm1, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
-; SSE41-NEXT: pmovsxwd %xmm2, %xmm12
+; SSE41-NEXT: pmovsxwd %xmm2, %xmm10
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; SSE41-NEXT: pmovsxwd %xmm2, %xmm13
+; SSE41-NEXT: pmovsxwd %xmm2, %xmm2
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
-; SSE41-NEXT: pmovsxwd %xmm3, %xmm14
+; SSE41-NEXT: pmovsxwd %xmm3, %xmm11
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE41-NEXT: pmovsxwd %xmm3, %xmm15
+; SSE41-NEXT: pmovsxwd %xmm3, %xmm3
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
-; SSE41-NEXT: pmovsxwd %xmm4, %xmm0
+; SSE41-NEXT: pmovsxwd %xmm4, %xmm12
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm4, %xmm4
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
-; SSE41-NEXT: pmovsxwd %xmm5, %xmm2
+; SSE41-NEXT: pmovsxwd %xmm5, %xmm13
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm5, %xmm5
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
-; SSE41-NEXT: pmovsxwd %xmm6, %xmm1
+; SSE41-NEXT: pmovsxwd %xmm6, %xmm14
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm6, %xmm6
; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
-; SSE41-NEXT: pmovsxwd %xmm7, %xmm3
+; SSE41-NEXT: pmovsxwd %xmm7, %xmm15
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm7, %xmm7
; SSE41-NEXT: movdqa %xmm7, 240(%rdi)
-; SSE41-NEXT: movdqa %xmm3, 224(%rdi)
+; SSE41-NEXT: movdqa %xmm15, 224(%rdi)
; SSE41-NEXT: movdqa %xmm6, 208(%rdi)
-; SSE41-NEXT: movdqa %xmm1, 192(%rdi)
+; SSE41-NEXT: movdqa %xmm14, 192(%rdi)
; SSE41-NEXT: movdqa %xmm5, 176(%rdi)
-; SSE41-NEXT: movdqa %xmm2, 160(%rdi)
+; SSE41-NEXT: movdqa %xmm13, 160(%rdi)
; SSE41-NEXT: movdqa %xmm4, 144(%rdi)
-; SSE41-NEXT: movdqa %xmm0, 128(%rdi)
-; SSE41-NEXT: movdqa %xmm15, 112(%rdi)
-; SSE41-NEXT: movdqa %xmm14, 96(%rdi)
-; SSE41-NEXT: movdqa %xmm13, 80(%rdi)
-; SSE41-NEXT: movdqa %xmm12, 64(%rdi)
-; SSE41-NEXT: movdqa %xmm11, 48(%rdi)
-; SSE41-NEXT: movdqa %xmm10, 32(%rdi)
-; SSE41-NEXT: movdqa %xmm9, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm12, 128(%rdi)
+; SSE41-NEXT: movdqa %xmm3, 112(%rdi)
+; SSE41-NEXT: movdqa %xmm11, 96(%rdi)
+; SSE41-NEXT: movdqa %xmm2, 80(%rdi)
+; SSE41-NEXT: movdqa %xmm10, 64(%rdi)
+; SSE41-NEXT: movdqa %xmm1, 48(%rdi)
+; SSE41-NEXT: movdqa %xmm9, 32(%rdi)
+; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
; SSE41-NEXT: movdqa %xmm8, (%rdi)
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 78012b3d514e1..43110af64c77e 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -394,10 +394,10 @@ define i128 @cnt128(i128 %x) nounwind readnone {
; X64-NEXT: movq %rax, %rdx
; X64-NEXT: shrq $4, %rdx
; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %r9, %rdx
-; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101
-; X64-NEXT: imulq %rsi, %rdx
+; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rsi, %rdx
+; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101
+; X64-NEXT: imulq %r9, %rdx
; X64-NEXT: shrq $56, %rdx
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
@@ -411,8 +411,8 @@ define i128 @cnt128(i128 %x) nounwind readnone {
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: andq %r9, %rax
-; X64-NEXT: imulq %rsi, %rax
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: imulq %r9, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: addq %rdx, %rax
; X64-NEXT: xorl %edx, %edx
@@ -899,10 +899,10 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
; X64-NEXT: movq %rax, %rdx
; X64-NEXT: shrq $4, %rdx
; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %r9, %rdx
-; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101
-; X64-NEXT: imulq %rsi, %rdx
+; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rsi, %rdx
+; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101
+; X64-NEXT: imulq %r9, %rdx
; X64-NEXT: shrq $56, %rdx
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
@@ -916,8 +916,8 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: andq %r9, %rax
-; X64-NEXT: imulq %rsi, %rax
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: imulq %r9, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: addq %rdx, %rax
; X64-NEXT: xorl %edx, %edx
@@ -1329,10 +1329,10 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
; X64-NEXT: movq %rax, %rdx
; X64-NEXT: shrq $4, %rdx
; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %r9, %rdx
-; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101
-; X64-NEXT: imulq %rsi, %rdx
+; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rsi, %rdx
+; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101
+; X64-NEXT: imulq %r9, %rdx
; X64-NEXT: shrq $56, %rdx
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
@@ -1346,8 +1346,8 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: andq %r9, %rax
-; X64-NEXT: imulq %rsi, %rax
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: imulq %r9, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: addq %rdx, %rax
; X64-NEXT: xorl %edx, %edx
diff --git a/llvm/test/CodeGen/X86/pr18344.ll b/llvm/test/CodeGen/X86/pr18344.ll
index 07abd1395ac6e..75a55e6a4bf5e 100644
--- a/llvm/test/CodeGen/X86/pr18344.ll
+++ b/llvm/test/CodeGen/X86/pr18344.ll
@@ -37,13 +37,13 @@ define void @FFT(ptr noalias nocapture %destination, ptr noalias %re, ptr noalia
; X64-NEXT: movdqu (%rdx), %xmm0
; X64-NEXT: pslld $4, %xmm0
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movslq %eax, %r8
+; X64-NEXT: cltq
; X64-NEXT: pextrd $1, %xmm0, %ecx
; X64-NEXT: movslq %ecx, %rcx
; X64-NEXT: pextrd $2, %xmm0, %edx
; X64-NEXT: movslq %edx, %rdx
-; X64-NEXT: pextrd $3, %xmm0, %eax
-; X64-NEXT: cltq
+; X64-NEXT: pextrd $3, %xmm0, %r8d
+; X64-NEXT: movslq %r8d, %r8
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/pr21792.ll b/llvm/test/CodeGen/X86/pr21792.ll
index 57df699750d97..af6a616fda3c5 100644
--- a/llvm/test/CodeGen/X86/pr21792.ll
+++ b/llvm/test/CodeGen/X86/pr21792.ll
@@ -12,16 +12,16 @@ define void @func(<4 x float> %vx) {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movd %xmm0, %r8d
-; CHECK-NEXT: leaq stuff(%r8), %rdi
-; CHECK-NEXT: pextrd $1, %xmm0, %eax
-; CHECK-NEXT: leaq stuff(%rax), %rsi
-; CHECK-NEXT: pextrd $2, %xmm0, %edx
-; CHECK-NEXT: pextrd $3, %xmm0, %ecx
-; CHECK-NEXT: leaq stuff(%rdx), %rdx
-; CHECK-NEXT: leaq stuff(%rcx), %rcx
-; CHECK-NEXT: leaq stuff+8(%r8), %r8
-; CHECK-NEXT: leaq stuff+8(%rax), %r9
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: leaq stuff(%rax), %rdi
+; CHECK-NEXT: pextrd $1, %xmm0, %r9d
+; CHECK-NEXT: leaq stuff(%r9), %rsi
+; CHECK-NEXT: pextrd $2, %xmm0, %ecx
+; CHECK-NEXT: pextrd $3, %xmm0, %r8d
+; CHECK-NEXT: leaq stuff(%rcx), %rdx
+; CHECK-NEXT: leaq stuff(%r8), %rcx
+; CHECK-NEXT: leaq stuff+8(%rax), %r8
+; CHECK-NEXT: leaq stuff+8(%r9), %r9
; CHECK-NEXT: callq toto at PLT
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
diff --git a/llvm/test/CodeGen/X86/pr23603.ll b/llvm/test/CodeGen/X86/pr23603.ll
index 514f4e5a91194..22440c890ba1d 100644
--- a/llvm/test/CodeGen/X86/pr23603.ll
+++ b/llvm/test/CodeGen/X86/pr23603.ll
@@ -9,14 +9,14 @@ define void @f(ptr %x, i32 %c32, ptr %y) nounwind {
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movq %rdx, %rbx
; CHECK-NEXT: movl %esi, %ebp
-; CHECK-NEXT: movl (%rdi), %ebx
+; CHECK-NEXT: movl (%rdi), %r14d
; CHECK-NEXT: callq free_v at PLT
; CHECK-NEXT: testl %ebp, %ebp
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1: # %left
-; CHECK-NEXT: movl %ebx, (%r14)
+; CHECK-NEXT: movl %r14d, (%rbx)
; CHECK-NEXT: .LBB0_2: # %merge
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll
index 52d6bbd5a54d3..61f67e959ec6d 100644
--- a/llvm/test/CodeGen/X86/pr29112.ll
+++ b/llvm/test/CodeGen/X86/pr29112.ll
@@ -10,11 +10,11 @@ define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <1
; CHECK: # %bb.0:
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: vmovaps %xmm1, %xmm9
-; CHECK-NEXT: vmovaps {{.*#+}} xmm14 = [4,22,1,17]
-; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm14
-; CHECK-NEXT: vmovaps {{.*#+}} xmm10 = [4,30,1,22]
-; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
+; CHECK-NEXT: vmovaps %xmm1, %xmm13
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4,22,1,17]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm12 = [4,30,1,22]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm12
; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
; CHECK-NEXT: vmovaps {{.*#+}} xmm7 = <5,20,u,u>
@@ -22,37 +22,36 @@ define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <1
; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1]
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm9
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm9[0,1],xmm2[1],xmm9[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[3,3,3,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
; CHECK-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8
-; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm11[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm8, %xmm11, %xmm8
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,1],xmm2[3,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
-; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2
-; CHECK-NEXT: vmovaps %xmm13, %xmm1
-; CHECK-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vaddps %xmm10, %xmm13, %xmm10
-; CHECK-NEXT: vaddps %xmm13, %xmm13, %xmm3
-; CHECK-NEXT: vaddps %xmm12, %xmm14, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vaddps %xmm1, %xmm12, %xmm9
+; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
+; CHECK-NEXT: vaddps %xmm0, %xmm10, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm13, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %xmm10, (%rsp)
-; CHECK-NEXT: vmovaps %xmm9, %xmm3
+; CHECK-NEXT: vmovaps %xmm9, (%rsp)
+; CHECK-NEXT: vmovaps %xmm13, %xmm3
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo at PLT
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll
index 0381edf8cbfa1..d9671aa04f460 100644
--- a/llvm/test/CodeGen/X86/pr32329.ll
+++ b/llvm/test/CodeGen/X86/pr32329.ll
@@ -68,26 +68,26 @@ define void @foo() local_unnamed_addr {
;
; X64-LABEL: foo:
; X64: # %bb.0: # %entry
-; X64-NEXT: movsbl var_27(%rip), %r9d
-; X64-NEXT: movzwl var_2(%rip), %r8d
+; X64-NEXT: movsbl var_27(%rip), %eax
+; X64-NEXT: movzwl var_2(%rip), %edx
; X64-NEXT: movl var_310(%rip), %ecx
-; X64-NEXT: imull %r9d, %ecx
+; X64-NEXT: imull %eax, %ecx
; X64-NEXT: addl var_24(%rip), %ecx
; X64-NEXT: movl $4194303, %esi # imm = 0x3FFFFF
; X64-NEXT: andl obj(%rip), %esi
; X64-NEXT: leal (%rsi,%rsi), %edi
-; X64-NEXT: subl %r9d, %edi
-; X64-NEXT: movl %edi, %edx
-; X64-NEXT: subl %r8d, %edx
-; X64-NEXT: imull %edx, %ecx
+; X64-NEXT: subl %eax, %edi
+; X64-NEXT: movl %edi, %r8d
+; X64-NEXT: subl %edx, %r8d
+; X64-NEXT: imull %r8d, %ecx
; X64-NEXT: addb $113, %cl
-; X64-NEXT: movl $9, %eax
+; X64-NEXT: movl $9, %edx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shlq %cl, %rax
-; X64-NEXT: movq %rax, var_50(%rip)
-; X64-NEXT: cmpl %esi, %edx
+; X64-NEXT: shlq %cl, %rdx
+; X64-NEXT: movq %rdx, var_50(%rip)
+; X64-NEXT: cmpl %esi, %r8d
; X64-NEXT: setge var_205(%rip)
-; X64-NEXT: imull %r9d, %edi
+; X64-NEXT: imull %eax, %edi
; X64-NEXT: movb %dil, var_218(%rip)
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/pr35316.ll b/llvm/test/CodeGen/X86/pr35316.ll
index ef98997e4d929..95e45a631aaaf 100644
--- a/llvm/test/CodeGen/X86/pr35316.ll
+++ b/llvm/test/CodeGen/X86/pr35316.ll
@@ -26,19 +26,19 @@ define void @foo() {
; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movl $0, b(%rip)
-; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d
-; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi
; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d
; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl a(%rip)
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl c(%rip), %eax
; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl %esi
+; CHECK-NEXT: idivl %r8d
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: andl %r8d, %eax
+; CHECK-NEXT: andl %esi, %eax
; CHECK-NEXT: movl %eax, (%rax)
; CHECK-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/pr38185.ll b/llvm/test/CodeGen/X86/pr38185.ll
index 370f162ae4956..e062302d4f0b5 100644
--- a/llvm/test/CodeGen/X86/pr38185.ll
+++ b/llvm/test/CodeGen/X86/pr38185.ll
@@ -8,19 +8,19 @@ define void @foo(ptr %a, ptr %b, ptr noalias %c, i64 %s) {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %r9
-; CHECK-NEXT: cmpq %rcx, %r9
+; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: cmpq %rcx, %rax
; CHECK-NEXT: je .LBB0_3
; CHECK-NEXT: # %bb.2: # %body
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: movl $1, (%rdx,%r9,4)
-; CHECK-NEXT: movzbl (%rdi,%r9,4), %r8d
-; CHECK-NEXT: movzbl (%rsi,%r9,4), %eax
-; CHECK-NEXT: andl %r8d, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: movl %eax, (%rdi,%r9,4)
-; CHECK-NEXT: incq %r9
-; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $1, (%rdx,%rax,4)
+; CHECK-NEXT: movzbl (%rdi,%rax,4), %r8d
+; CHECK-NEXT: movzbl (%rsi,%rax,4), %r9d
+; CHECK-NEXT: andl %r8d, %r9d
+; CHECK-NEXT: andl $1, %r9d
+; CHECK-NEXT: movl %r9d, (%rdi,%rax,4)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .LBB0_3: # %endloop
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll
index 404855b6222ff..19f9fed78f312 100644
--- a/llvm/test/CodeGen/X86/pr38217.ll
+++ b/llvm/test/CodeGen/X86/pr38217.ll
@@ -9,31 +9,30 @@ define dso_local void @_Z12d2s_bufferedmPc(i64, i8* nocapture) {
; CHECK-NEXT: cmpq $10000, %rdi # imm = 0x2710
; CHECK-NEXT: jb .LBB0_3
; CHECK-NEXT: # %bb.1: # %.preheader
-; CHECK-NEXT: movq %rdi, %r9
-; CHECK-NEXT: xorl %r10d, %r10d
+; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: movabsq $3777893186295716171, %r8 # imm = 0x346DC5D63886594B
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %r8
; CHECK-NEXT: shrq $11, %rdx
; CHECK-NEXT: imulq $10000, %rdx, %rax # imm = 0x2710
-; CHECK-NEXT: movq %r9, %rdi
-; CHECK-NEXT: subq %rax, %rdi
-; CHECK-NEXT: imulq $1374389535, %rdi, %rax # imm = 0x51EB851F
+; CHECK-NEXT: movq %rdi, %r9
+; CHECK-NEXT: subq %rax, %r9
+; CHECK-NEXT: imulq $1374389535, %r9, %rax # imm = 0x51EB851F
; CHECK-NEXT: shrq $37, %rax
-; CHECK-NEXT: imull $100, %eax, %ecx
-; CHECK-NEXT: subl %ecx, %edi
-; CHECK-NEXT: movl %r10d, %r11d
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: subq %r11, %rcx
-; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%rdi,%rdi), %edi
-; CHECK-NEXT: movw %di, -1(%rcx)
+; CHECK-NEXT: imull $100, %eax, %r10d
+; CHECK-NEXT: subl %r10d, %r9d
+; CHECK-NEXT: movl %ecx, %r10d
+; CHECK-NEXT: movq %rsi, %r11
+; CHECK-NEXT: subq %r10, %r11
+; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%r9,%r9), %r9d
+; CHECK-NEXT: movw %r9w, -1(%r11)
; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%rax,%rax), %eax
-; CHECK-NEXT: movw %ax, -3(%rcx)
-; CHECK-NEXT: addl $4, %r10d
-; CHECK-NEXT: cmpq $99999999, %r9 # imm = 0x5F5E0FF
-; CHECK-NEXT: movq %rdx, %r9
+; CHECK-NEXT: movw %ax, -3(%r11)
+; CHECK-NEXT: addl $4, %ecx
+; CHECK-NEXT: cmpq $99999999, %rdi # imm = 0x5F5E0FF
+; CHECK-NEXT: movq %rdx, %rdi
; CHECK-NEXT: ja .LBB0_2
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll
index 2cbced7053e87..7214ec75b0cf7 100644
--- a/llvm/test/CodeGen/X86/pr43820.ll
+++ b/llvm/test/CodeGen/X86/pr43820.ll
@@ -12,311 +12,310 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r11
-; CHECK-NEXT: shrq $4, %r11
-; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
-; CHECK-NEXT: andq %rsi, %r11
-; CHECK-NEXT: andq %rsi, %rbp
-; CHECK-NEXT: shlq $4, %rbp
-; CHECK-NEXT: orq %r11, %rbp
-; CHECK-NEXT: movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
-; CHECK-NEXT: movq %rbp, %r12
-; CHECK-NEXT: andq %rdi, %r12
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: leaq (%rbp,%r12,4), %rbp
-; CHECK-NEXT: movabsq $6148914691230924800, %r12 # imm = 0x5555555555000000
-; CHECK-NEXT: movq %rbp, %r13
-; CHECK-NEXT: andq %r12, %r13
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: andq %r12, %rbp
-; CHECK-NEXT: leaq (%rbp,%r13,2), %rax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: bswapq %rbx
-; CHECK-NEXT: movq %rbx, %rbp
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: andq %rsi, %rbp
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: shlq $4, %rbx
-; CHECK-NEXT: orq %rbp, %rbx
-; CHECK-NEXT: movq %rbx, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: shrq $2, %rbx
-; CHECK-NEXT: andq %rdi, %rbx
-; CHECK-NEXT: leaq (%rbx,%rbp,4), %rbp
-; CHECK-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
-; CHECK-NEXT: movq %rbp, %r12
-; CHECK-NEXT: andq %rbx, %r12
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: andq %rbx, %rbp
-; CHECK-NEXT: leaq (%rbp,%r12,2), %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; CHECK-NEXT: bswapq %r12
+; CHECK-NEXT: movq %r12, %r10
+; CHECK-NEXT: shrq $4, %r10
+; CHECK-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
+; CHECK-NEXT: andq %rax, %r10
+; CHECK-NEXT: andq %rax, %r12
+; CHECK-NEXT: shlq $4, %r12
+; CHECK-NEXT: orq %r10, %r12
+; CHECK-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
+; CHECK-NEXT: movq %r12, %r13
+; CHECK-NEXT: andq %r10, %r13
+; CHECK-NEXT: shrq $2, %r12
+; CHECK-NEXT: andq %r10, %r12
+; CHECK-NEXT: leaq (%r12,%r13,4), %r12
+; CHECK-NEXT: movabsq $6148914691230924800, %r13 # imm = 0x5555555555000000
+; CHECK-NEXT: movq %r12, %rbp
+; CHECK-NEXT: andq %r13, %rbp
+; CHECK-NEXT: shrq %r12
+; CHECK-NEXT: andq %r13, %r12
+; CHECK-NEXT: leaq (%r12,%rbp,2), %rsi
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: bswapq %r14
+; CHECK-NEXT: movq %r14, %r12
+; CHECK-NEXT: shrq $4, %r12
+; CHECK-NEXT: movq %rax, %rbp
+; CHECK-NEXT: andq %rax, %r12
+; CHECK-NEXT: andq %rax, %r14
+; CHECK-NEXT: shlq $4, %r14
+; CHECK-NEXT: orq %r12, %r14
+; CHECK-NEXT: movq %r14, %r12
+; CHECK-NEXT: andq %r10, %r12
+; CHECK-NEXT: shrq $2, %r14
+; CHECK-NEXT: andq %r10, %r14
+; CHECK-NEXT: leaq (%r14,%r12,4), %r12
+; CHECK-NEXT: movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555
+; CHECK-NEXT: movq %r12, %r13
+; CHECK-NEXT: andq %r14, %r13
+; CHECK-NEXT: shrq %r12
+; CHECK-NEXT: andq %r14, %r12
+; CHECK-NEXT: leaq (%r12,%r13,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r15
-; CHECK-NEXT: movq %r15, %rbp
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: andq %rsi, %rbp
-; CHECK-NEXT: andq %rsi, %r15
+; CHECK-NEXT: movq %r15, %r12
+; CHECK-NEXT: shrq $4, %r12
+; CHECK-NEXT: andq %rbp, %r12
+; CHECK-NEXT: andq %rbp, %r15
; CHECK-NEXT: shlq $4, %r15
-; CHECK-NEXT: orq %rbp, %r15
-; CHECK-NEXT: movq %r15, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
+; CHECK-NEXT: orq %r12, %r15
+; CHECK-NEXT: movq %r15, %r12
+; CHECK-NEXT: andq %r10, %r12
; CHECK-NEXT: shrq $2, %r15
-; CHECK-NEXT: andq %rdi, %r15
-; CHECK-NEXT: leaq (%r15,%rbp,4), %rbp
-; CHECK-NEXT: movq %rbp, %r15
-; CHECK-NEXT: andq %rbx, %r15
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: andq %rbx, %rbp
-; CHECK-NEXT: leaq (%rbp,%r15,2), %rax
+; CHECK-NEXT: andq %r10, %r15
+; CHECK-NEXT: leaq (%r15,%r12,4), %r15
+; CHECK-NEXT: movq %r15, %r12
+; CHECK-NEXT: andq %r14, %r12
+; CHECK-NEXT: shrq %r15
+; CHECK-NEXT: andq %r14, %r15
+; CHECK-NEXT: leaq (%r15,%r12,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: bswapq %r14
-; CHECK-NEXT: movq %r14, %rbp
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: andq %rsi, %rbp
-; CHECK-NEXT: andq %rsi, %r14
-; CHECK-NEXT: shlq $4, %r14
-; CHECK-NEXT: orq %rbp, %r14
-; CHECK-NEXT: movq %r14, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: shrq $2, %r14
-; CHECK-NEXT: andq %rdi, %r14
-; CHECK-NEXT: leaq (%r14,%rbp,4), %rbp
-; CHECK-NEXT: movq %rbp, %r14
-; CHECK-NEXT: andq %rbx, %r14
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: andq %rbx, %rbp
-; CHECK-NEXT: leaq (%rbp,%r14,2), %rax
+; CHECK-NEXT: bswapq %rbx
+; CHECK-NEXT: movq %rbx, %r15
+; CHECK-NEXT: shrq $4, %r15
+; CHECK-NEXT: andq %rbp, %r15
+; CHECK-NEXT: andq %rbp, %rbx
+; CHECK-NEXT: shlq $4, %rbx
+; CHECK-NEXT: orq %r15, %rbx
+; CHECK-NEXT: movq %rbx, %r15
+; CHECK-NEXT: andq %r10, %r15
+; CHECK-NEXT: shrq $2, %rbx
+; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: leaq (%rbx,%r15,4), %rbx
+; CHECK-NEXT: movq %rbx, %r15
+; CHECK-NEXT: andq %r14, %r15
+; CHECK-NEXT: shrq %rbx
+; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: leaq (%rbx,%r15,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: bswapq %r10
-; CHECK-NEXT: movq %r10, %rbp
-; CHECK-NEXT: shrq $4, %rbp
-; CHECK-NEXT: andq %rsi, %rbp
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: orq %rbp, %r10
-; CHECK-NEXT: movq %r10, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: shrq $2, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: leaq (%r10,%rbp,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: andq %rbx, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rax
+; CHECK-NEXT: bswapq %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: shrq $4, %rbx
+; CHECK-NEXT: andq %rbp, %rbx
+; CHECK-NEXT: andq %rbp, %rdi
+; CHECK-NEXT: shlq $4, %rdi
+; CHECK-NEXT: orq %rbx, %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %r10, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: shrq %rdi
+; CHECK-NEXT: andq %r14, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; CHECK-NEXT: bswapq %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: shrq $4, %r10
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: andq %rsi, %rbp
-; CHECK-NEXT: shlq $4, %rbp
-; CHECK-NEXT: orq %r10, %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: shrq $2, %rbp
-; CHECK-NEXT: andq %rdi, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
-; CHECK-NEXT: movq %rbp, %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: shrq %rbp
-; CHECK-NEXT: andq %rbx, %rbp
-; CHECK-NEXT: leaq (%rbp,%r10,2), %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: bswapq %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: shrq $4, %rbx
+; CHECK-NEXT: andq %rbp, %rbx
+; CHECK-NEXT: andq %rbp, %rdi
+; CHECK-NEXT: shlq $4, %rdi
+; CHECK-NEXT: orq %rbx, %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %r10, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: shrq %rdi
+; CHECK-NEXT: andq %r14, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: bswapq %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: shrq $4, %r14
-; CHECK-NEXT: andq %rsi, %r14
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: orq %r14, %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: andq %rdi, %r14
-; CHECK-NEXT: shrq $2, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: movq %rdi, %rbp
-; CHECK-NEXT: leaq (%r10,%r14,4), %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: andq %rbx, %r14
-; CHECK-NEXT: shrq %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: leaq (%r10,%r14,2), %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: bswapq %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: shrq $4, %rbx
+; CHECK-NEXT: andq %rbp, %rbx
+; CHECK-NEXT: andq %rbp, %rdi
+; CHECK-NEXT: shlq $4, %rdi
+; CHECK-NEXT: orq %rbx, %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %r10, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: shrq %rdi
+; CHECK-NEXT: andq %r14, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: bswapq %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: shrq $4, %r14
-; CHECK-NEXT: andq %rsi, %r14
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: orq %r14, %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: andq %rdi, %r14
-; CHECK-NEXT: shrq $2, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: leaq (%r10,%r14,4), %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: andq %rbx, %r14
-; CHECK-NEXT: shrq %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: leaq (%r10,%r14,2), %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: bswapq %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: shrq $4, %rbx
+; CHECK-NEXT: andq %rbp, %rbx
+; CHECK-NEXT: andq %rbp, %rdi
+; CHECK-NEXT: shlq $4, %rdi
+; CHECK-NEXT: orq %rbx, %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %r10, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: shrq %rdi
+; CHECK-NEXT: andq %r14, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: bswapq %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: shrq $4, %r14
-; CHECK-NEXT: andq %rsi, %r14
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: orq %r14, %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: andq %rdi, %r14
-; CHECK-NEXT: shrq $2, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: leaq (%r10,%r14,4), %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: andq %rbx, %r14
-; CHECK-NEXT: shrq %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: leaq (%r10,%r14,2), %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: bswapq %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: shrq $4, %rbx
+; CHECK-NEXT: andq %rbp, %rbx
+; CHECK-NEXT: andq %rbp, %rdi
+; CHECK-NEXT: shlq $4, %rdi
+; CHECK-NEXT: orq %rbx, %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %r10, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: shrq %rdi
+; CHECK-NEXT: andq %r14, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: bswapq %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: shrq $4, %r14
-; CHECK-NEXT: andq %rsi, %r14
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: orq %r14, %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: andq %rdi, %r14
-; CHECK-NEXT: shrq $2, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: leaq (%r10,%r14,4), %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: andq %rbx, %r14
-; CHECK-NEXT: shrq %r10
-; CHECK-NEXT: andq %rbx, %r10
-; CHECK-NEXT: leaq (%r10,%r14,2), %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: bswapq %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: shrq $4, %rbx
+; CHECK-NEXT: andq %rbp, %rbx
+; CHECK-NEXT: andq %rbp, %rdi
+; CHECK-NEXT: shlq $4, %rdi
+; CHECK-NEXT: orq %rbx, %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %r10, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: shrq %rdi
+; CHECK-NEXT: andq %r14, %rdi
+; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: bswapq %r10
-; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: bswapq %rdi
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: shlq $4, %r10
-; CHECK-NEXT: orq %rax, %r10
-; CHECK-NEXT: movq %r10, %rax
-; CHECK-NEXT: andq %rdi, %rax
-; CHECK-NEXT: shrq $2, %r10
-; CHECK-NEXT: andq %rdi, %r10
-; CHECK-NEXT: leaq (%r10,%rax,4), %rax
-; CHECK-NEXT: movq %rax, %r10
-; CHECK-NEXT: andq %rbx, %r10
+; CHECK-NEXT: andq %rbp, %rax
+; CHECK-NEXT: andq %rbp, %rdi
+; CHECK-NEXT: shlq $4, %rdi
+; CHECK-NEXT: orq %rax, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: andq %r10, %rax
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %r10, %rdi
+; CHECK-NEXT: leaq (%rdi,%rax,4), %rax
+; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: andq %r14, %rdi
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %rbx, %rax
-; CHECK-NEXT: leaq (%rax,%r10,2), %rax
+; CHECK-NEXT: andq %r14, %rax
+; CHECK-NEXT: leaq (%rax,%rdi,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r9
; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %r9
+; CHECK-NEXT: andq %rbp, %rax
+; CHECK-NEXT: andq %rbp, %r9
; CHECK-NEXT: shlq $4, %r9
; CHECK-NEXT: orq %rax, %r9
; CHECK-NEXT: movq %r9, %rax
-; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: andq %r10, %rax
; CHECK-NEXT: shrq $2, %r9
-; CHECK-NEXT: andq %rdi, %r9
+; CHECK-NEXT: andq %r10, %r9
; CHECK-NEXT: leaq (%r9,%rax,4), %rax
; CHECK-NEXT: movq %rax, %r9
-; CHECK-NEXT: andq %rbx, %r9
+; CHECK-NEXT: andq %r14, %r9
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: andq %r14, %rax
; CHECK-NEXT: leaq (%rax,%r9,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r8
; CHECK-NEXT: movq %r8, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %r8
+; CHECK-NEXT: andq %rbp, %rax
+; CHECK-NEXT: andq %rbp, %r8
; CHECK-NEXT: shlq $4, %r8
; CHECK-NEXT: orq %rax, %r8
; CHECK-NEXT: movq %r8, %rax
-; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: andq %r10, %rax
; CHECK-NEXT: shrq $2, %r8
-; CHECK-NEXT: andq %rdi, %r8
+; CHECK-NEXT: andq %r10, %r8
; CHECK-NEXT: leaq (%r8,%rax,4), %rax
; CHECK-NEXT: movq %rax, %r8
-; CHECK-NEXT: andq %rbx, %r8
+; CHECK-NEXT: andq %r14, %r8
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %rbx, %rax
-; CHECK-NEXT: leaq (%rax,%r8,2), %rax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: andq %r14, %rax
+; CHECK-NEXT: leaq (%rax,%r8,2), %r8
; CHECK-NEXT: bswapq %rcx
; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: andq %rbp, %rax
+; CHECK-NEXT: andq %rbp, %rcx
; CHECK-NEXT: shlq $4, %rcx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: andq %r10, %rax
; CHECK-NEXT: shrq $2, %rcx
-; CHECK-NEXT: andq %rdi, %rcx
+; CHECK-NEXT: andq %r10, %rcx
; CHECK-NEXT: leaq (%rcx,%rax,4), %rax
; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: andq %rbx, %rcx
+; CHECK-NEXT: andq %r14, %rcx
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: andq %r14, %rax
; CHECK-NEXT: leaq (%rax,%rcx,2), %r12
; CHECK-NEXT: bswapq %rdx
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %rdx
+; CHECK-NEXT: andq %rbp, %rax
+; CHECK-NEXT: andq %rbp, %rdx
; CHECK-NEXT: shlq $4, %rdx
; CHECK-NEXT: orq %rax, %rdx
; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: andq %r10, %rax
; CHECK-NEXT: shrq $2, %rdx
-; CHECK-NEXT: andq %rdi, %rdx
+; CHECK-NEXT: andq %r10, %rdx
; CHECK-NEXT: leaq (%rdx,%rax,4), %rax
; CHECK-NEXT: movq %rax, %rdx
-; CHECK-NEXT: andq %rbx, %rdx
+; CHECK-NEXT: andq %r14, %rdx
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: andq %r14, %rax
; CHECK-NEXT: leaq (%rax,%rdx,2), %rdi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: bswapq %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq $4, %rcx
-; CHECK-NEXT: andq %rsi, %rcx
-; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: andq %rbp, %rcx
+; CHECK-NEXT: andq %rbp, %rax
; CHECK-NEXT: shlq $4, %rax
; CHECK-NEXT: orq %rcx, %rax
; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: andq %rbp, %rcx
+; CHECK-NEXT: andq %r10, %rcx
; CHECK-NEXT: shrq $2, %rax
-; CHECK-NEXT: andq %rbp, %rax
+; CHECK-NEXT: andq %r10, %rax
; CHECK-NEXT: leaq (%rax,%rcx,4), %rax
-; CHECK-NEXT: movq %rax, %rsi
-; CHECK-NEXT: andq %rbx, %rsi
+; CHECK-NEXT: movq %rax, %r10
+; CHECK-NEXT: andq %r14, %r10
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %rbx, %rax
-; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-NEXT: andq %r14, %rax
+; CHECK-NEXT: leaq (%rax,%r10,2), %rdx
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %rax, %rdx
+; CHECK-NEXT: shrdq $24, %rax, %rsi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rcx, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -337,14 +336,15 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: shrdq $24, %r10, %r11
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r9, %r10
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %r8, %r9
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %rax, %r8
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: shrdq $24, %rcx, %r9
+; CHECK-NEXT: movq %r8, %rax
+; CHECK-NEXT: shrdq $24, %r8, %rcx
+; CHECK-NEXT: movq %rcx, %r8
; CHECK-NEXT: shrdq $24, %r12, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrdq $24, %rdi, %r12
-; CHECK-NEXT: shrdq $24, %rsi, %rdi
+; CHECK-NEXT: shrdq $24, %rdx, %rdi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: movq %rdi, 112(%rax)
; CHECK-NEXT: movq %r12, 104(%rax)
@@ -362,10 +362,10 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: movq %rcx, 8(%rax)
-; CHECK-NEXT: movq %rdx, (%rax)
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $56, %rsi
-; CHECK-NEXT: movb %sil, 124(%rax)
+; CHECK-NEXT: movq %rsi, (%rax)
+; CHECK-NEXT: movq %rdx, %rcx
+; CHECK-NEXT: shrq $56, %rdx
+; CHECK-NEXT: movb %dl, 124(%rax)
; CHECK-NEXT: shrq $24, %rcx
; CHECK-NEXT: movl %ecx, 120(%rax)
; CHECK-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll
index 70d4928639a67..f1a409a29db35 100644
--- a/llvm/test/CodeGen/X86/pr45563-2.ll
+++ b/llvm/test/CodeGen/X86/pr45563-2.ll
@@ -203,8 +203,8 @@ define <17 x float> @mload_split17(<17 x i1> %mask, ptr %addr, <17 x float> %dst
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: vmovd %esi, %xmm3
; CHECK-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3
; CHECK-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3
@@ -218,7 +218,7 @@ define <17 x float> @mload_split17(<17 x i1> %mask, ptr %addr, <17 x float> %dst
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; CHECK-NEXT: vmaskmovps (%r10), %ymm3, %ymm4
+; CHECK-NEXT: vmaskmovps (%rdi), %ymm3, %ymm4
; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm2, %ymm2
; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
@@ -233,12 +233,12 @@ define <17 x float> @mload_split17(<17 x i1> %mask, ptr %addr, <17 x float> %dst
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; CHECK-NEXT: vmaskmovps 32(%r10), %ymm3, %ymm4
+; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm3, %ymm4
; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1
-; CHECK-NEXT: vmovd %edi, %xmm3
+; CHECK-NEXT: vmovd %r10d, %xmm3
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; CHECK-NEXT: vpslld $31, %xmm3, %xmm3
-; CHECK-NEXT: vmaskmovps 64(%r10), %ymm3, %ymm4
+; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm3, %ymm4
; CHECK-NEXT: vblendvps %xmm3, %xmm4, %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, 64(%rax)
; CHECK-NEXT: vmovaps %ymm1, 32(%rax)
@@ -276,8 +276,8 @@ define <23 x float> @mload_split23(<23 x i1> %mask, ptr %addr, <23 x float> %dst
; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: vmovd %esi, %xmm4
; CHECK-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4
; CHECK-NEXT: vpinsrb $4, %ecx, %xmm4, %xmm4
@@ -291,7 +291,7 @@ define <23 x float> @mload_split23(<23 x i1> %mask, ptr %addr, <23 x float> %dst
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; CHECK-NEXT: vmaskmovps (%r10), %ymm4, %ymm5
+; CHECK-NEXT: vmaskmovps (%rdi), %ymm4, %ymm5
; CHECK-NEXT: vblendvps %ymm4, %ymm5, %ymm3, %ymm3
; CHECK-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4
@@ -306,9 +306,9 @@ define <23 x float> @mload_split23(<23 x i1> %mask, ptr %addr, <23 x float> %dst
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; CHECK-NEXT: vmaskmovps 32(%r10), %ymm4, %ymm5
+; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm4, %ymm5
; CHECK-NEXT: vblendvps %ymm4, %ymm5, %ymm2, %ymm2
-; CHECK-NEXT: vmovd %edi, %xmm4
+; CHECK-NEXT: vmovd %r10d, %xmm4
; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4
@@ -320,7 +320,7 @@ define <23 x float> @mload_split23(<23 x i1> %mask, ptr %addr, <23 x float> %dst
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
-; CHECK-NEXT: vmaskmovps 64(%r10), %ymm6, %ymm6
+; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm6, %ymm6
; CHECK-NEXT: vmovaps %ymm2, 32(%rax)
; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm2
; CHECK-NEXT: vblendvps %xmm4, %xmm2, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr45563.ll b/llvm/test/CodeGen/X86/pr45563.ll
index 43c209ff781e4..214ae56b50c01 100644
--- a/llvm/test/CodeGen/X86/pr45563.ll
+++ b/llvm/test/CodeGen/X86/pr45563.ll
@@ -26,33 +26,33 @@ define <16 x double> @bug45563(ptr %addr, <16 x double> %dst, <16 x i64> %e, <16
; CHECK-NEXT: vmovdqa 128(%rbp), %xmm10
; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8
; CHECK-NEXT: vpcmpgtq %xmm7, %xmm9, %xmm7
-; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm8
-; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm10
+; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
+; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm8
; CHECK-NEXT: vmovdqa 80(%rbp), %xmm9
-; CHECK-NEXT: vmovdqa 96(%rbp), %xmm7
-; CHECK-NEXT: vpcmpgtq %xmm10, %xmm7, %xmm7
+; CHECK-NEXT: vmovdqa 96(%rbp), %xmm10
+; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8
; CHECK-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm6
-; CHECK-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm10
-; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm7
+; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
+; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm8
; CHECK-NEXT: vmovdqa 48(%rbp), %xmm9
-; CHECK-NEXT: vmovdqa 64(%rbp), %xmm6
-; CHECK-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6
+; CHECK-NEXT: vmovdqa 64(%rbp), %xmm10
+; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8
; CHECK-NEXT: vpcmpgtq %xmm5, %xmm9, %xmm5
-; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm6
+; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5
+; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm8
; CHECK-NEXT: vmovdqa 16(%rbp), %xmm9
-; CHECK-NEXT: vmovdqa 32(%rbp), %xmm7
-; CHECK-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
+; CHECK-NEXT: vmovdqa 32(%rbp), %xmm10
+; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8
; CHECK-NEXT: vpcmpgtq %xmm4, %xmm9, %xmm4
-; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; CHECK-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm6
-; CHECK-NEXT: vblendvpd %ymm4, %ymm6, %ymm0, %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4
+; CHECK-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm8
+; CHECK-NEXT: vblendvpd %ymm4, %ymm8, %ymm0, %ymm0
; CHECK-NEXT: vmaskmovpd 32(%rdi), %ymm5, %ymm4
; CHECK-NEXT: vblendvpd %ymm5, %ymm4, %ymm1, %ymm1
-; CHECK-NEXT: vmaskmovpd 64(%rdi), %ymm10, %ymm4
-; CHECK-NEXT: vblendvpd %ymm10, %ymm4, %ymm2, %ymm2
-; CHECK-NEXT: vmaskmovpd 96(%rdi), %ymm8, %ymm4
-; CHECK-NEXT: vblendvpd %ymm8, %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vmaskmovpd 64(%rdi), %ymm6, %ymm4
+; CHECK-NEXT: vblendvpd %ymm6, %ymm4, %ymm2, %ymm2
+; CHECK-NEXT: vmaskmovpd 96(%rdi), %ymm7, %ymm4
+; CHECK-NEXT: vblendvpd %ymm7, %ymm4, %ymm3, %ymm3
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
diff --git a/llvm/test/CodeGen/X86/pr45995.ll b/llvm/test/CodeGen/X86/pr45995.ll
index f79e8a5676eb7..f9a9ab768f1a0 100644
--- a/llvm/test/CodeGen/X86/pr45995.ll
+++ b/llvm/test/CodeGen/X86/pr45995.ll
@@ -15,21 +15,21 @@ define void @extracter0([4 x <4 x i1>] %matrix) {
; CHECK-NEXT: .cfi_offset rbp, -16
; CHECK-NEXT: vpslld xmm0, xmm0, 31
; CHECK-NEXT: vmovmskps edi, xmm0
+; CHECK-NEXT: mov ebx, edi
+; CHECK-NEXT: shr bl, 3
; CHECK-NEXT: mov ebp, edi
-; CHECK-NEXT: shr bpl, 3
+; CHECK-NEXT: and bpl, 4
+; CHECK-NEXT: shr bpl, 2
; CHECK-NEXT: mov r14d, edi
-; CHECK-NEXT: and r14b, 4
-; CHECK-NEXT: shr r14b, 2
-; CHECK-NEXT: mov ebx, edi
-; CHECK-NEXT: and bl, 2
-; CHECK-NEXT: shr bl
-; CHECK-NEXT: call print_i1@PLT
-; CHECK-NEXT: movzx edi, bl
+; CHECK-NEXT: and r14b, 2
+; CHECK-NEXT: shr r14b
; CHECK-NEXT: call print_i1@PLT
; CHECK-NEXT: movzx edi, r14b
; CHECK-NEXT: call print_i1@PLT
; CHECK-NEXT: movzx edi, bpl
; CHECK-NEXT: call print_i1@PLT
+; CHECK-NEXT: movzx edi, bl
+; CHECK-NEXT: call print_i1@PLT
; CHECK-NEXT: pop rbx
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pop r14
@@ -73,39 +73,39 @@ define void @extracter1([4 x <4 x i1>] %matrix) {
; CHECK-NEXT: .cfi_offset r15, -24
; CHECK-NEXT: .cfi_offset rbp, -16
; CHECK-NEXT: vpslld xmm1, xmm1, 31
-; CHECK-NEXT: vmovmskps ebp, xmm1
-; CHECK-NEXT: mov eax, ebp
+; CHECK-NEXT: vmovmskps ebx, xmm1
+; CHECK-NEXT: mov eax, ebx
; CHECK-NEXT: shr al, 3
; CHECK-NEXT: mov byte ptr [rsp + 7], al # 1-byte Spill
-; CHECK-NEXT: mov r15d, ebp
-; CHECK-NEXT: and r15b, 4
-; CHECK-NEXT: shr r15b, 2
-; CHECK-NEXT: mov r13d, ebp
-; CHECK-NEXT: and r13b, 2
-; CHECK-NEXT: shr r13b
+; CHECK-NEXT: mov r14d, ebx
+; CHECK-NEXT: and r14b, 4
+; CHECK-NEXT: shr r14b, 2
+; CHECK-NEXT: mov r15d, ebx
+; CHECK-NEXT: and r15b, 2
+; CHECK-NEXT: shr r15b
; CHECK-NEXT: vpslld xmm0, xmm0, 31
; CHECK-NEXT: vmovmskps edi, xmm0
; CHECK-NEXT: mov r12d, edi
; CHECK-NEXT: shr r12b, 3
-; CHECK-NEXT: mov ebx, edi
-; CHECK-NEXT: and bl, 4
-; CHECK-NEXT: shr bl, 2
-; CHECK-NEXT: mov r14d, edi
-; CHECK-NEXT: and r14b, 2
-; CHECK-NEXT: shr r14b
+; CHECK-NEXT: mov r13d, edi
+; CHECK-NEXT: and r13b, 4
+; CHECK-NEXT: shr r13b, 2
+; CHECK-NEXT: mov ebp, edi
+; CHECK-NEXT: and bpl, 2
+; CHECK-NEXT: shr bpl
; CHECK-NEXT: call print_i1@PLT
-; CHECK-NEXT: movzx edi, r14b
+; CHECK-NEXT: movzx edi, bpl
; CHECK-NEXT: call print_i1@PLT
-; CHECK-NEXT: movzx edi, bl
+; CHECK-NEXT: movzx edi, r13b
; CHECK-NEXT: call print_i1@PLT
; CHECK-NEXT: movzx edi, r12b
; CHECK-NEXT: call print_i1@PLT
-; CHECK-NEXT: mov edi, ebp
-; CHECK-NEXT: call print_i1@PLT
-; CHECK-NEXT: movzx edi, r13b
+; CHECK-NEXT: mov edi, ebx
; CHECK-NEXT: call print_i1@PLT
; CHECK-NEXT: movzx edi, r15b
; CHECK-NEXT: call print_i1@PLT
+; CHECK-NEXT: movzx edi, r14b
+; CHECK-NEXT: call print_i1@PLT
; CHECK-NEXT: movzx edi, byte ptr [rsp + 7] # 1-byte Folded Reload
; CHECK-NEXT: call print_i1@PLT
; CHECK-NEXT: add rsp, 8
diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll
index 9022203ab88b4..cfd39672ef910 100644
--- a/llvm/test/CodeGen/X86/pr46877.ll
+++ b/llvm/test/CodeGen/X86/pr46877.ll
@@ -7,20 +7,20 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5,
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovaps %xmm3, %xmm15
; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm12
-; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm3
-; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm15 * xmm3) - xmm0
+; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm10
+; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm10 = (xmm3 * xmm10) - xmm0
; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0
; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm2
-; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm3
+; CHECK-NEXT: vmulss %xmm2, %xmm10, %xmm4
; CHECK-NEXT: vmulss %xmm6, %xmm12, %xmm2
; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0
-; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm5
+; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm5
; CHECK-NEXT: vmulss %xmm0, %xmm13, %xmm2
; CHECK-NEXT: vmovss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: vmulss %xmm2, %xmm10, %xmm2
+; CHECK-NEXT: vmulss %xmm2, %xmm9, %xmm2
; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3
; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0
@@ -34,12 +34,13 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5,
; CHECK-NEXT: vmulss %xmm5, %xmm2, %xmm2
; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm5
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm10 * xmm5) + xmm0
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm9 * xmm5) + xmm0
; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm4
-; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm6
-; CHECK-NEXT: vmovss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: vmulss %xmm6, %xmm14, %xmm5
+; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm8
+; CHECK-NEXT: vmovss %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: vmovaps %xmm5, %xmm10
+; CHECK-NEXT: vmulss %xmm14, %xmm8, %xmm5
; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0
; CHECK-NEXT: vmulss %xmm5, %xmm2, %xmm2
; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
@@ -66,7 +67,7 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5,
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm4
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4
; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
@@ -78,44 +79,43 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5,
; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm1
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0
-; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm10
-; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm6
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm4
-; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm13, %xmm5
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm7 * xmm5) + xmm0
-; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm4
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm5
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm5
-; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm12
-; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm7 * xmm5) + xmm0
+; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm4
+; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm5
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm10
+; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm13, %xmm12
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm12 = -(xmm7 * xmm12) + xmm0
+; CHECK-NEXT: vmulss %xmm12, %xmm10, %xmm10
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; CHECK-NEXT: vmulss %xmm4, %xmm10, %xmm12
+; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0
; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmulss %xmm6, %xmm3, %xmm2
-; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm10 * xmm2) + xmm0
+; CHECK-NEXT: vmulss %xmm5, %xmm3, %xmm6
+; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm9
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1
; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0
-; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm5
-; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0
+; CHECK-NEXT: vmulss %xmm6, %xmm2, %xmm2
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm6
+; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0
; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; CHECK-NEXT: vmulss %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vmulss %xmm6, %xmm1, %xmm1
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2
; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0
; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm4
-; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm3
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm2
+; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm3
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm12 * xmm3) + xmm0
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2
; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0
; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
@@ -127,11 +127,11 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5,
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm2
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Reload
-; CHECK-NEXT: # xmm12 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2
+; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Reload
+; CHECK-NEXT: # xmm10 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm2
; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0
; CHECK-NEXT: vmulss %xmm7, %xmm2, %xmm2
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
@@ -142,63 +142,63 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5,
; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0
; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm2
+; CHECK-NEXT: vmulss %xmm0, %xmm6, %xmm2
; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm10 * xmm2) + xmm0
+; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0
; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm5 * xmm3) + xmm0
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm6 * xmm3) + xmm0
; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm8
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm3
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4
; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm10
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm5 * xmm11) + xmm0
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm2
+; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm6 * xmm11) + xmm0
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2
; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm4
+; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4
; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
; CHECK-NEXT: vmulss %xmm2, %xmm11, %xmm2
; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 4-byte Folded Reload
; CHECK-NEXT: # xmm14 = -(xmm14 * mem) + xmm0
-; CHECK-NEXT: vmulss %xmm2, %xmm14, %xmm9
+; CHECK-NEXT: vmulss %xmm2, %xmm14, %xmm4
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm2
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm11
-; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm11 = -(xmm11 * mem) + xmm0
-; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm7
-; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 4-byte Folded Reload
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm1
-; CHECK-NEXT: vmulss %xmm6, %xmm15, %xmm6
-; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm3 * xmm6) + xmm0
-; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm4
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm3 * xmm4) + xmm0
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm3 * xmm7) + xmm0
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm3 * xmm5) + xmm0
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2
-; CHECK-NEXT: vmulss %xmm0, %xmm13, %xmm3
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0
+; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmulss %xmm6, %xmm13, %xmm7
+; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 4-byte Folded Reload
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm8
+; CHECK-NEXT: vmulss %xmm5, %xmm15, %xmm5
+; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm11 * xmm5) + xmm0
+; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm9
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm9 = -(xmm11 * xmm9) + xmm0
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm11 * xmm7) + xmm0
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm11 * xmm6) + xmm0
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm10
+; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm11
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm11, %xmm11
; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm12 * xmm3) + xmm0
-; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0
-; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm15 * xmm1) - xmm0
-; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm8 = -(xmm8 * mem) + xmm0
-; CHECK-NEXT: vmulss %xmm8, %xmm9, %xmm0
-; CHECK-NEXT: vmulss %xmm6, %xmm0, %xmm0
-; CHECK-NEXT: vmulss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm12 * xmm11) + xmm0
+; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm12 * xmm10) + xmm0
+; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm8 = (xmm15 * xmm8) - xmm0
+; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0
+; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vmulss %xmm5, %xmm0, %xmm0
+; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm0
; CHECK-NEXT: vmulss %xmm7, %xmm0, %xmm0
-; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm4
-; CHECK-NEXT: vmulss %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vmulss %xmm5, %xmm11, %xmm4
-; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3
-; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmulss %xmm6, %xmm2, %xmm1
+; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm1
+; CHECK-NEXT: vmulss %xmm1, %xmm10, %xmm1
; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vmulss %xmm1, %xmm8, %xmm1
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr47299.ll b/llvm/test/CodeGen/X86/pr47299.ll
index b0324486e8135..7cb1112402ebe 100644
--- a/llvm/test/CodeGen/X86/pr47299.ll
+++ b/llvm/test/CodeGen/X86/pr47299.ll
@@ -15,38 +15,38 @@ define <7 x i1> @create_mask7(i64 %0) {
; CHECK-NEXT: vpbroadcastq zmm0, rsi
; CHECK-NEXT: vpcmpnleuq k0, zmm0, zmmword ptr [rip + {{\.?LCPI[0-9]+_[0-9]+}}]
; CHECK-NEXT: kshiftrb k1, k0, 6
-; CHECK-NEXT: kmovd r8d, k1
+; CHECK-NEXT: kmovd ecx, k1
; CHECK-NEXT: kshiftrb k1, k0, 5
-; CHECK-NEXT: kmovd r9d, k1
+; CHECK-NEXT: kmovd edx, k1
; CHECK-NEXT: kshiftrb k1, k0, 4
-; CHECK-NEXT: kmovd r10d, k1
+; CHECK-NEXT: kmovd esi, k1
; CHECK-NEXT: kshiftrb k1, k0, 3
; CHECK-NEXT: kmovd edi, k1
; CHECK-NEXT: kshiftrb k1, k0, 2
-; CHECK-NEXT: kmovd ecx, k1
+; CHECK-NEXT: kmovd r8d, k1
; CHECK-NEXT: kshiftrb k1, k0, 1
-; CHECK-NEXT: kmovd edx, k1
-; CHECK-NEXT: kmovd esi, k0
-; CHECK-NEXT: and sil, 1
-; CHECK-NEXT: and dl, 1
-; CHECK-NEXT: add dl, dl
-; CHECK-NEXT: or dl, sil
-; CHECK-NEXT: and cl, 1
-; CHECK-NEXT: shl cl, 2
-; CHECK-NEXT: or cl, dl
-; CHECK-NEXT: and dil, 1
-; CHECK-NEXT: shl dil, 3
-; CHECK-NEXT: or dil, cl
+; CHECK-NEXT: kmovd r9d, k1
+; CHECK-NEXT: kmovd r10d, k0
; CHECK-NEXT: and r10b, 1
-; CHECK-NEXT: shl r10b, 4
-; CHECK-NEXT: or r10b, dil
; CHECK-NEXT: and r9b, 1
-; CHECK-NEXT: shl r9b, 5
+; CHECK-NEXT: add r9b, r9b
; CHECK-NEXT: or r9b, r10b
-; CHECK-NEXT: shl r8b, 6
+; CHECK-NEXT: and r8b, 1
+; CHECK-NEXT: shl r8b, 2
; CHECK-NEXT: or r8b, r9b
-; CHECK-NEXT: and r8b, 127
-; CHECK-NEXT: mov byte ptr [rax], r8b
+; CHECK-NEXT: and dil, 1
+; CHECK-NEXT: shl dil, 3
+; CHECK-NEXT: or dil, r8b
+; CHECK-NEXT: and sil, 1
+; CHECK-NEXT: shl sil, 4
+; CHECK-NEXT: or sil, dil
+; CHECK-NEXT: and dl, 1
+; CHECK-NEXT: shl dl, 5
+; CHECK-NEXT: or dl, sil
+; CHECK-NEXT: shl cl, 6
+; CHECK-NEXT: or cl, dl
+; CHECK-NEXT: and cl, 127
+; CHECK-NEXT: mov byte ptr [rax], cl
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret
%2 = call <7 x i1> @llvm.get.active.lane.mask.v7i1.i64(i64 0, i64 %0)
diff --git a/llvm/test/CodeGen/X86/pr47857.ll b/llvm/test/CodeGen/X86/pr47857.ll
index 8f72bd6940031..419e839a5d974 100644
--- a/llvm/test/CodeGen/X86/pr47857.ll
+++ b/llvm/test/CodeGen/X86/pr47857.ll
@@ -8,29 +8,29 @@ define void @PR47857(ptr noalias nocapture writeonly sret(%"struct.std::array")
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rdx), %r9
-; CHECK-NEXT: movq 8(%rdx), %r8
+; CHECK-NEXT: movq 8(%rdx), %rcx
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: addq (%rsi), %r9
-; CHECK-NEXT: adcq 8(%rsi), %r8
-; CHECK-NEXT: movq 16(%rdx), %rcx
-; CHECK-NEXT: adcq 16(%rsi), %rcx
+; CHECK-NEXT: adcq 8(%rsi), %rcx
+; CHECK-NEXT: movq 16(%rdx), %r8
+; CHECK-NEXT: adcq 16(%rsi), %r8
; CHECK-NEXT: movq 24(%rdx), %rdx
; CHECK-NEXT: adcq 24(%rsi), %rdx
; CHECK-NEXT: sbbq %rdi, %rdi
; CHECK-NEXT: andl $38, %edi
; CHECK-NEXT: addq %rdi, %r9
-; CHECK-NEXT: adcq $0, %r8
; CHECK-NEXT: adcq $0, %rcx
+; CHECK-NEXT: adcq $0, %r8
; CHECK-NEXT: adcq $0, %rdx
; CHECK-NEXT: sbbq %rdi, %rdi
; CHECK-NEXT: andl $38, %edi
; CHECK-NEXT: addq %r9, %rdi
-; CHECK-NEXT: adcq $0, %r8
; CHECK-NEXT: adcq $0, %rcx
+; CHECK-NEXT: adcq $0, %r8
; CHECK-NEXT: adcq $0, %rdx
; CHECK-NEXT: movq %rdi, (%rax)
-; CHECK-NEXT: movq %r8, 8(%rax)
-; CHECK-NEXT: movq %rcx, 16(%rax)
+; CHECK-NEXT: movq %rcx, 8(%rax)
+; CHECK-NEXT: movq %r8, 16(%rax)
; CHECK-NEXT: movq %rdx, 24(%rax)
; CHECK-NEXT: retq
%4 = load i64, ptr %1, align 8
diff --git a/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll b/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll
index 5494032571618..553aaeb8a3e9c 100644
--- a/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll
+++ b/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll
@@ -10,17 +10,17 @@ define void @test(i1 %c, ptr %p, ptr noalias %p2) nounwind {
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: movl %edi, %r14d
-; CHECK-NEXT: movq (%rsi), %rbp
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movq (%rsi), %r14
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: jmpq *.LJTI0_0(,%rax,8)
; CHECK-NEXT: .LBB0_1: # %split.3
-; CHECK-NEXT: testb $1, %r14b
+; CHECK-NEXT: testb $1, %bpl
; CHECK-NEXT: je .LBB0_3
; CHECK-NEXT: # %bb.2: # %clobber
; CHECK-NEXT: callq clobber@PLT
; CHECK-NEXT: .LBB0_3: # %sink
-; CHECK-NEXT: movq %rbp, (%rbx)
+; CHECK-NEXT: movq %r14, (%rbx)
; CHECK-NEXT: .LBB0_4: # %latch
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll
index 4a58ea49608db..bc824d4fd5e33 100644
--- a/llvm/test/CodeGen/X86/promote-cmp.ll
+++ b/llvm/test/CodeGen/X86/promote-cmp.ll
@@ -8,34 +8,34 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
; SSE2-LABEL: PR45808:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm3, %xmm9
-; SSE2-NEXT: pxor %xmm4, %xmm9
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm8
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm8
; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm7[0,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm6[1,3]
; SSE2-NEXT: andps %xmm10, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm8[1,3]
-; SSE2-NEXT: orps %xmm4, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm7[1,3]
+; SSE2-NEXT: orps %xmm4, %xmm9
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; SSE2-NEXT: pxor %xmm9, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,1,3,3]
; SSE2-NEXT: psllq $63, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 37a18c530535e..1afadf814c76b 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -820,41 +820,41 @@ vector.ph:
define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE2OR3-LABEL: test14:
; SSE2OR3: # %bb.0: # %vector.ph
-; SSE2OR3-NEXT: pxor %xmm8, %xmm8
+; SSE2OR3-NEXT: pxor %xmm5, %xmm5
; SSE2OR3-NEXT: movdqa %xmm0, %xmm6
-; SSE2OR3-NEXT: movdqa %xmm4, %xmm9
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm10
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2OR3-NEXT: pand %xmm5, %xmm4
-; SSE2OR3-NEXT: pand %xmm5, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm4, %xmm7
+; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm9
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2OR3-NEXT: pand %xmm10, %xmm4
+; SSE2OR3-NEXT: pand %xmm10, %xmm3
; SSE2OR3-NEXT: packuswb %xmm4, %xmm3
; SSE2OR3-NEXT: movdqa %xmm1, %xmm4
-; SSE2OR3-NEXT: pand %xmm5, %xmm2
-; SSE2OR3-NEXT: pand %xmm5, %xmm1
+; SSE2OR3-NEXT: pand %xmm10, %xmm2
+; SSE2OR3-NEXT: pand %xmm10, %xmm1
; SSE2OR3-NEXT: packuswb %xmm2, %xmm1
; SSE2OR3-NEXT: packuswb %xmm3, %xmm1
; SSE2OR3-NEXT: psubb %xmm0, %xmm1
; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
-; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2OR3-NEXT: movdqa %xmm2, %xmm0
-; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
-; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
+; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; SSE2OR3-NEXT: movdqa %xmm6, %xmm3
-; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2OR3-NEXT: pxor %xmm5, %xmm9
+; SSE2OR3-NEXT: pxor %xmm5, %xmm7
; SSE2OR3-NEXT: por %xmm5, %xmm6
-; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm6
-; SSE2OR3-NEXT: pxor %xmm5, %xmm10
+; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6
+; SSE2OR3-NEXT: pxor %xmm5, %xmm8
; SSE2OR3-NEXT: por %xmm5, %xmm3
-; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm3
+; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3
; SSE2OR3-NEXT: packssdw %xmm6, %xmm3
-; SSE2OR3-NEXT: pxor %xmm5, %xmm7
+; SSE2OR3-NEXT: pxor %xmm5, %xmm9
; SSE2OR3-NEXT: por %xmm5, %xmm2
-; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm2
+; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2
; SSE2OR3-NEXT: pxor %xmm5, %xmm4
; SSE2OR3-NEXT: por %xmm5, %xmm0
; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0
@@ -866,27 +866,27 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE41-LABEL: test14:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; SSE41-NEXT: pmaxud %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE41-NEXT: pmaxud %xmm4, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm8
; SSE41-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE41-NEXT: pxor %xmm9, %xmm6
+; SSE41-NEXT: pxor %xmm9, %xmm8
; SSE41-NEXT: pmaxud %xmm3, %xmm7
; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
; SSE41-NEXT: pxor %xmm9, %xmm7
-; SSE41-NEXT: packssdw %xmm6, %xmm7
+; SSE41-NEXT: packssdw %xmm8, %xmm7
; SSE41-NEXT: pmaxud %xmm1, %xmm5
; SSE41-NEXT: pcmpeqd %xmm1, %xmm5
; SSE41-NEXT: pxor %xmm9, %xmm5
-; SSE41-NEXT: pmaxud %xmm2, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm8
-; SSE41-NEXT: pxor %xmm9, %xmm8
-; SSE41-NEXT: packssdw %xmm8, %xmm5
+; SSE41-NEXT: pmaxud %xmm2, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm9, %xmm6
+; SSE41-NEXT: packssdw %xmm6, %xmm5
; SSE41-NEXT: packsswb %xmm7, %xmm5
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm6, %xmm4
@@ -1608,66 +1608,66 @@ vector.ph:
define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE2OR3-LABEL: psubus_8i64_max:
; SSE2OR3: # %bb.0: # %vector.ph
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
-; SSE2OR3-NEXT: pxor %xmm8, %xmm7
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm6
-; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2OR3-NEXT: pand %xmm5, %xmm7
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT: por %xmm7, %xmm5
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535]
-; SSE2OR3-NEXT: pand %xmm5, %xmm2
-; SSE2OR3-NEXT: pandn %xmm10, %xmm5
-; SSE2OR3-NEXT: por %xmm2, %xmm5
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
+; SSE2OR3-NEXT: pxor %xmm5, %xmm7
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
+; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSE2OR3-NEXT: pand %xmm8, %xmm7
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSE2OR3-NEXT: por %xmm7, %xmm8
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535]
+; SSE2OR3-NEXT: pand %xmm8, %xmm2
+; SSE2OR3-NEXT: pandn %xmm7, %xmm8
+; SSE2OR3-NEXT: por %xmm2, %xmm8
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm5
-; SSE2OR3-NEXT: pxor %xmm8, %xmm5
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm6
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm7
-; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; SSE2OR3-NEXT: pand %xmm6, %xmm5
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2OR3-NEXT: por %xmm5, %xmm6
-; SSE2OR3-NEXT: pand %xmm6, %xmm1
-; SSE2OR3-NEXT: pandn %xmm10, %xmm6
-; SSE2OR3-NEXT: por %xmm1, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm8
+; SSE2OR3-NEXT: pxor %xmm5, %xmm8
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2OR3-NEXT: movdqa %xmm6, %xmm10
+; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm10
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
+; SSE2OR3-NEXT: pand %xmm9, %xmm8
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2OR3-NEXT: por %xmm8, %xmm9
+; SSE2OR3-NEXT: pand %xmm9, %xmm1
+; SSE2OR3-NEXT: pandn %xmm7, %xmm9
+; SSE2OR3-NEXT: por %xmm1, %xmm9
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm2
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm6
-; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
-; SSE2OR3-NEXT: pand %xmm5, %xmm2
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT: por %xmm2, %xmm5
-; SSE2OR3-NEXT: pand %xmm5, %xmm4
-; SSE2OR3-NEXT: pandn %xmm10, %xmm5
-; SSE2OR3-NEXT: por %xmm4, %xmm5
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
+; SSE2OR3-NEXT: pxor %xmm5, %xmm2
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
+; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
+; SSE2OR3-NEXT: pand %xmm8, %xmm2
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSE2OR3-NEXT: por %xmm2, %xmm8
+; SSE2OR3-NEXT: pand %xmm8, %xmm4
+; SSE2OR3-NEXT: pandn %xmm7, %xmm8
+; SSE2OR3-NEXT: por %xmm4, %xmm8
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
-; SSE2OR3-NEXT: pxor %xmm8, %xmm4
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,2,2]
-; SSE2OR3-NEXT: pand %xmm5, %xmm4
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE2OR3-NEXT: pxor %xmm5, %xmm4
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
+; SSE2OR3-NEXT: pand %xmm8, %xmm4
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2OR3-NEXT: por %xmm4, %xmm5
; SSE2OR3-NEXT: pand %xmm5, %xmm3
-; SSE2OR3-NEXT: pandn %xmm10, %xmm5
+; SSE2OR3-NEXT: pandn %xmm7, %xmm5
; SSE2OR3-NEXT: por %xmm3, %xmm5
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
@@ -1678,56 +1678,56 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
;
; SSE41-LABEL: psubus_8i64_max:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: movdqa %xmm5, %xmm7
+; SSE41-NEXT: pxor %xmm8, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT: movdqa %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm6 = [65535,65535]
-; SSE41-NEXT: movapd %xmm6, %xmm10
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm5, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm6, %xmm4
+; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm7, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
-; SSE41-NEXT: packusdw %xmm10, %xmm4
+; SSE41-NEXT: packusdw %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: pxor %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm6, %xmm3
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm7, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pxor %xmm1, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
-; SSE41-NEXT: packusdw %xmm3, %xmm6
-; SSE41-NEXT: packusdw %xmm4, %xmm6
-; SSE41-NEXT: psubusw %xmm6, %xmm8
-; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT: packusdw %xmm3, %xmm7
+; SSE41-NEXT: packusdw %xmm4, %xmm7
+; SSE41-NEXT: psubusw %xmm7, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: psubus_8i64_max:
@@ -1793,47 +1793,47 @@ vector.ph:
define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
; SSE2OR3-LABEL: psubus_16i32_max:
; SSE2OR3: # %bb.0: # %vector.ph
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
-; SSE2OR3-NEXT: pxor %xmm9, %xmm8
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm6
-; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6
+; SSE2OR3-NEXT: pxor %xmm7, %xmm8
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
+; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9
; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE2OR3-NEXT: pand %xmm6, %xmm3
-; SSE2OR3-NEXT: pxor %xmm8, %xmm6
-; SSE2OR3-NEXT: por %xmm3, %xmm6
-; SSE2OR3-NEXT: pslld $16, %xmm6
-; SSE2OR3-NEXT: psrad $16, %xmm6
+; SSE2OR3-NEXT: pand %xmm9, %xmm3
+; SSE2OR3-NEXT: pxor %xmm8, %xmm9
+; SSE2OR3-NEXT: por %xmm3, %xmm9
+; SSE2OR3-NEXT: pslld $16, %xmm9
+; SSE2OR3-NEXT: psrad $16, %xmm9
; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
-; SSE2OR3-NEXT: pxor %xmm9, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm7
-; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2OR3-NEXT: pand %xmm7, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm7
-; SSE2OR3-NEXT: por %xmm2, %xmm7
-; SSE2OR3-NEXT: pslld $16, %xmm7
-; SSE2OR3-NEXT: psrad $16, %xmm7
-; SSE2OR3-NEXT: packssdw %xmm6, %xmm7
-; SSE2OR3-NEXT: psubusw %xmm7, %xmm0
+; SSE2OR3-NEXT: pxor %xmm7, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm6, %xmm10
+; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2OR3-NEXT: pand %xmm10, %xmm2
+; SSE2OR3-NEXT: pxor %xmm8, %xmm10
+; SSE2OR3-NEXT: por %xmm2, %xmm10
+; SSE2OR3-NEXT: pslld $16, %xmm10
+; SSE2OR3-NEXT: psrad $16, %xmm10
+; SSE2OR3-NEXT: packssdw %xmm9, %xmm10
+; SSE2OR3-NEXT: psubusw %xmm10, %xmm0
; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm9, %xmm2
-; SSE2OR3-NEXT: movdqa %xmm10, %xmm3
+; SSE2OR3-NEXT: pxor %xmm7, %xmm2
+; SSE2OR3-NEXT: movdqa %xmm6, %xmm3
; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2OR3-NEXT: pand %xmm3, %xmm5
; SSE2OR3-NEXT: pxor %xmm8, %xmm3
; SSE2OR3-NEXT: por %xmm5, %xmm3
; SSE2OR3-NEXT: pslld $16, %xmm3
; SSE2OR3-NEXT: psrad $16, %xmm3
-; SSE2OR3-NEXT: pxor %xmm4, %xmm9
-; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10
-; SSE2OR3-NEXT: pxor %xmm10, %xmm8
-; SSE2OR3-NEXT: pand %xmm4, %xmm10
-; SSE2OR3-NEXT: por %xmm8, %xmm10
-; SSE2OR3-NEXT: pslld $16, %xmm10
-; SSE2OR3-NEXT: psrad $16, %xmm10
-; SSE2OR3-NEXT: packssdw %xmm3, %xmm10
-; SSE2OR3-NEXT: psubusw %xmm10, %xmm1
+; SSE2OR3-NEXT: pxor %xmm4, %xmm7
+; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6
+; SSE2OR3-NEXT: pxor %xmm6, %xmm8
+; SSE2OR3-NEXT: pand %xmm4, %xmm6
+; SSE2OR3-NEXT: por %xmm8, %xmm6
+; SSE2OR3-NEXT: pslld $16, %xmm6
+; SSE2OR3-NEXT: psrad $16, %xmm6
+; SSE2OR3-NEXT: packssdw %xmm3, %xmm6
+; SSE2OR3-NEXT: psubusw %xmm6, %xmm1
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: psubus_16i32_max:
@@ -2672,130 +2672,130 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
; SSE2OR3-LABEL: test33:
; SSE2OR3: # %bb.0:
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm6
-; SSE2OR3-NEXT: pxor %xmm8, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm10
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm7
-; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2OR3-NEXT: pand %xmm10, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2OR3-NEXT: por %xmm6, %xmm7
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE2OR3-NEXT: pand %xmm7, %xmm3
-; SSE2OR3-NEXT: pxor %xmm10, %xmm7
-; SSE2OR3-NEXT: por %xmm3, %xmm7
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
+; SSE2OR3-NEXT: pxor %xmm6, %xmm8
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455]
+; SSE2OR3-NEXT: movdqa %xmm7, %xmm10
+; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm10
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
+; SSE2OR3-NEXT: pand %xmm9, %xmm8
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2OR3-NEXT: por %xmm8, %xmm9
+; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE2OR3-NEXT: pand %xmm9, %xmm3
+; SSE2OR3-NEXT: pxor %xmm8, %xmm9
+; SSE2OR3-NEXT: por %xmm3, %xmm9
; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
-; SSE2OR3-NEXT: pxor %xmm8, %xmm3
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm11
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm6
-; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSE2OR3-NEXT: pand %xmm11, %xmm3
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT: por %xmm3, %xmm6
-; SSE2OR3-NEXT: pand %xmm6, %xmm2
-; SSE2OR3-NEXT: pxor %xmm10, %xmm6
-; SSE2OR3-NEXT: por %xmm2, %xmm6
-; SSE2OR3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE2OR3-NEXT: pxor %xmm6, %xmm3
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2OR3-NEXT: movdqa %xmm7, %xmm11
+; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm11
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2]
+; SSE2OR3-NEXT: pand %xmm10, %xmm3
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE2OR3-NEXT: por %xmm3, %xmm10
+; SSE2OR3-NEXT: pand %xmm10, %xmm2
+; SSE2OR3-NEXT: pxor %xmm8, %xmm10
+; SSE2OR3-NEXT: por %xmm2, %xmm10
+; SSE2OR3-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2]
; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
-; SSE2OR3-NEXT: psubd %xmm6, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm6
-; SSE2OR3-NEXT: pxor %xmm8, %xmm0
-; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2OR3-NEXT: psubd %xmm10, %xmm2
+; SSE2OR3-NEXT: pxor %xmm6, %xmm10
+; SSE2OR3-NEXT: pxor %xmm6, %xmm0
+; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm0
; SSE2OR3-NEXT: pand %xmm2, %xmm0
; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm2
+; SSE2OR3-NEXT: pxor %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm6
-; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
+; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm7, %xmm9
+; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
; SSE2OR3-NEXT: pand %xmm3, %xmm2
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
; SSE2OR3-NEXT: por %xmm2, %xmm3
; SSE2OR3-NEXT: pand %xmm3, %xmm5
-; SSE2OR3-NEXT: pxor %xmm10, %xmm3
+; SSE2OR3-NEXT: pxor %xmm8, %xmm3
; SSE2OR3-NEXT: por %xmm5, %xmm3
; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm2
+; SSE2OR3-NEXT: pxor %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
+; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm7
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
; SSE2OR3-NEXT: pand %xmm5, %xmm2
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2OR3-NEXT: por %xmm2, %xmm5
-; SSE2OR3-NEXT: pxor %xmm5, %xmm10
+; SSE2OR3-NEXT: pxor %xmm5, %xmm8
; SSE2OR3-NEXT: pand %xmm4, %xmm5
-; SSE2OR3-NEXT: por %xmm10, %xmm5
+; SSE2OR3-NEXT: por %xmm8, %xmm5
; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
; SSE2OR3-NEXT: psubd %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm5
-; SSE2OR3-NEXT: pxor %xmm8, %xmm1
+; SSE2OR3-NEXT: pxor %xmm6, %xmm5
+; SSE2OR3-NEXT: pxor %xmm6, %xmm1
; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm1
; SSE2OR3-NEXT: pand %xmm2, %xmm1
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: test33:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: movdqa %xmm11, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: movdqa %xmm11, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295]
-; SSE41-NEXT: movapd %xmm7, %xmm10
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE41-NEXT: movapd %xmm8, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm11, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm11, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm7, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm3
+; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
-; SSE41-NEXT: pmaxud %xmm3, %xmm8
-; SSE41-NEXT: psubd %xmm3, %xmm8
+; SSE41-NEXT: pmaxud %xmm3, %xmm6
+; SSE41-NEXT: psubd %xmm3, %xmm6
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm11, %xmm2
+; SSE41-NEXT: movdqa %xmm7, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm11, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm2
+; SSE41-NEXT: movapd %xmm8, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm4, %xmm9
-; SSE41-NEXT: movdqa %xmm11, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm3
; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm11, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
-; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2]
-; SSE41-NEXT: pmaxud %xmm7, %xmm1
-; SSE41-NEXT: psubd %xmm7, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
+; SSE41-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2]
+; SSE41-NEXT: pmaxud %xmm8, %xmm1
+; SSE41-NEXT: psubd %xmm8, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test33:
@@ -2903,133 +2903,133 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1]
; SSE2OR3-NEXT: pand %xmm6, %xmm1
; SSE2OR3-NEXT: pand %xmm6, %xmm0
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm6
-; SSE2OR3-NEXT: pxor %xmm8, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm10
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm7
-; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2OR3-NEXT: pand %xmm10, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2OR3-NEXT: por %xmm6, %xmm7
-; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE2OR3-NEXT: pand %xmm7, %xmm3
-; SSE2OR3-NEXT: pxor %xmm10, %xmm7
-; SSE2OR3-NEXT: por %xmm3, %xmm7
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
+; SSE2OR3-NEXT: pxor %xmm6, %xmm8
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455]
+; SSE2OR3-NEXT: movdqa %xmm7, %xmm10
+; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm10
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
+; SSE2OR3-NEXT: pand %xmm9, %xmm8
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2OR3-NEXT: por %xmm8, %xmm9
+; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE2OR3-NEXT: pand %xmm9, %xmm3
+; SSE2OR3-NEXT: pxor %xmm8, %xmm9
+; SSE2OR3-NEXT: por %xmm3, %xmm9
; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
-; SSE2OR3-NEXT: pxor %xmm8, %xmm3
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm11
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm6
-; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSE2OR3-NEXT: pand %xmm11, %xmm3
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2OR3-NEXT: por %xmm3, %xmm6
-; SSE2OR3-NEXT: pand %xmm6, %xmm2
-; SSE2OR3-NEXT: pxor %xmm10, %xmm6
-; SSE2OR3-NEXT: por %xmm2, %xmm6
-; SSE2OR3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE2OR3-NEXT: pxor %xmm6, %xmm3
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3]
+; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2OR3-NEXT: movdqa %xmm7, %xmm11
+; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm11
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2]
+; SSE2OR3-NEXT: pand %xmm10, %xmm3
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE2OR3-NEXT: por %xmm3, %xmm10
+; SSE2OR3-NEXT: pand %xmm10, %xmm2
+; SSE2OR3-NEXT: pxor %xmm8, %xmm10
+; SSE2OR3-NEXT: por %xmm2, %xmm10
+; SSE2OR3-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2]
; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
-; SSE2OR3-NEXT: psubd %xmm6, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm6
-; SSE2OR3-NEXT: por %xmm8, %xmm0
-; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2OR3-NEXT: psubd %xmm10, %xmm2
+; SSE2OR3-NEXT: pxor %xmm6, %xmm10
+; SSE2OR3-NEXT: por %xmm6, %xmm0
+; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm0
; SSE2OR3-NEXT: pand %xmm2, %xmm0
; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm2
+; SSE2OR3-NEXT: pxor %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm9, %xmm6
-; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2]
+; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm7, %xmm9
+; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
; SSE2OR3-NEXT: pand %xmm3, %xmm2
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
; SSE2OR3-NEXT: por %xmm2, %xmm3
; SSE2OR3-NEXT: pand %xmm3, %xmm5
-; SSE2OR3-NEXT: pxor %xmm10, %xmm3
+; SSE2OR3-NEXT: pxor %xmm8, %xmm3
; SSE2OR3-NEXT: por %xmm5, %xmm3
; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm2
+; SSE2OR3-NEXT: pxor %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
+; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm7
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
; SSE2OR3-NEXT: pand %xmm5, %xmm2
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2OR3-NEXT: por %xmm2, %xmm5
-; SSE2OR3-NEXT: pxor %xmm5, %xmm10
+; SSE2OR3-NEXT: pxor %xmm5, %xmm8
; SSE2OR3-NEXT: pand %xmm4, %xmm5
-; SSE2OR3-NEXT: por %xmm10, %xmm5
+; SSE2OR3-NEXT: por %xmm8, %xmm5
; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
; SSE2OR3-NEXT: psubd %xmm5, %xmm2
-; SSE2OR3-NEXT: pxor %xmm8, %xmm5
-; SSE2OR3-NEXT: por %xmm8, %xmm1
+; SSE2OR3-NEXT: pxor %xmm6, %xmm5
+; SSE2OR3-NEXT: por %xmm6, %xmm1
; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm1
; SSE2OR3-NEXT: pand %xmm2, %xmm1
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: test34:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm8
+; SSE41-NEXT: pand %xmm0, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: movdqa %xmm11, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: movdqa %xmm11, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295]
-; SSE41-NEXT: movapd %xmm7, %xmm10
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE41-NEXT: movapd %xmm8, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm11, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm11, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm7, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm3
+; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
-; SSE41-NEXT: pmaxud %xmm3, %xmm8
-; SSE41-NEXT: psubd %xmm3, %xmm8
+; SSE41-NEXT: pmaxud %xmm3, %xmm6
+; SSE41-NEXT: psubd %xmm3, %xmm6
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm11, %xmm2
+; SSE41-NEXT: movdqa %xmm7, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm11, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm2
+; SSE41-NEXT: movapd %xmm8, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm4, %xmm9
-; SSE41-NEXT: movdqa %xmm11, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm3
; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm11, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
-; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2]
-; SSE41-NEXT: pmaxud %xmm7, %xmm1
-; SSE41-NEXT: psubd %xmm7, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
+; SSE41-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2]
+; SSE41-NEXT: pmaxud %xmm8, %xmm1
+; SSE41-NEXT: psubd %xmm8, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test34:
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index 3b197a3627e62..fd9251323ca1e 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -65,8 +65,8 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je LBB0_54
; CHECK-NEXT: ## %bb.6: ## %SyTime.exit2720
-; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: movq %rdi, %rbp
+; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: cmpq %rax, %rcx
@@ -76,10 +76,10 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
; CHECK-NEXT: movl $32, %esi
; CHECK-NEXT: callq _memset
; CHECK-NEXT: LBB0_8: ## %while.body.preheader
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: imulq $1040, %rbx, %rax ## imm = 0x410
+; CHECK-NEXT: imulq $1040, %r14, %rax ## imm = 0x410
; CHECK-NEXT: movq _syBuf@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: leaq 8(%rcx,%rax), %rdx
+; CHECK-NEXT: leaq 8(%rcx,%rax), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl $1, %r15d
; CHECK-NEXT: movq _syCTRO@GOTPCREL(%rip), %rax
; CHECK-NEXT: movb $1, %cl
@@ -90,14 +90,13 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne LBB0_9
; CHECK-NEXT: ## %bb.10: ## %do.end
-; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: xorl %r14d, %r14d
-; CHECK-NEXT: testb %r14b, %r14b
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: testb %bl, %bl
; CHECK-NEXT: jne LBB0_11
; CHECK-NEXT: ## %bb.12: ## %while.body200.preheader
; CHECK-NEXT: xorl %r13d, %r13d
; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx
-; CHECK-NEXT: leaq LJTI0_1(%rip), %rbx
+; CHECK-NEXT: leaq LJTI0_1(%rip), %r14
; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: jmp LBB0_13
@@ -110,19 +109,19 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: decl %r15d
; CHECK-NEXT: testl %r15d, %r15d
-; CHECK-NEXT: movl %r14d, %r12d
+; CHECK-NEXT: movl %ebx, %r12d
; CHECK-NEXT: jle LBB0_21
; CHECK-NEXT: LBB0_13: ## %while.body200
; CHECK-NEXT: ## =>This Loop Header: Depth=1
; CHECK-NEXT: ## Child Loop BB0_28 Depth 2
; CHECK-NEXT: ## Child Loop BB0_37 Depth 2
-; CHECK-NEXT: leal -268(%r14), %eax
+; CHECK-NEXT: leal -268(%rbx), %eax
; CHECK-NEXT: cmpl $105, %eax
; CHECK-NEXT: ja LBB0_14
; CHECK-NEXT: ## %bb.55: ## %while.body200
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movslq (%rbx,%rax,4), %rax
-; CHECK-NEXT: addq %rbx, %rax
+; CHECK-NEXT: movslq (%r14,%rax,4), %rax
+; CHECK-NEXT: addq %r14, %rax
; CHECK-NEXT: jmpq *%rax
; CHECK-NEXT: LBB0_25: ## %sw.bb474
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
@@ -164,7 +163,7 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_14: ## %while.body200
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: leal 1(%r14), %eax
+; CHECK-NEXT: leal 1(%rbx), %eax
; CHECK-NEXT: cmpl $21, %eax
; CHECK-NEXT: ja LBB0_20
; CHECK-NEXT: ## %bb.15: ## %while.body200
@@ -174,7 +173,7 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
; CHECK-NEXT: jmpq *%rax
; CHECK-NEXT: LBB0_18: ## %while.cond201.preheader
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl $1, %r14d
+; CHECK-NEXT: movl $1, %ebx
; CHECK-NEXT: jmp LBB0_20
; CHECK-NEXT: LBB0_44: ## %sw.bb1134
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
@@ -184,15 +183,15 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
; CHECK-NEXT: jb LBB0_54
; CHECK-NEXT: ## %bb.45: ## in Loop: Header=BB0_13 Depth=1
; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
-; CHECK-NEXT: movl $268, %r14d ## imm = 0x10C
+; CHECK-NEXT: movl $268, %ebx ## imm = 0x10C
; CHECK-NEXT: jmp LBB0_20
; CHECK-NEXT: LBB0_39: ## %sw.bb566
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl $20, %r14d
+; CHECK-NEXT: movl $20, %ebx
; CHECK-NEXT: jmp LBB0_20
; CHECK-NEXT: LBB0_19: ## %sw.bb243
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl $2, %r14d
+; CHECK-NEXT: movl $2, %ebx
; CHECK-NEXT: jmp LBB0_20
; CHECK-NEXT: LBB0_32: ## %if.end517.loopexitsplit
; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
@@ -246,30 +245,30 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
; CHECK-NEXT: LBB0_11:
; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: LBB0_21: ## %while.end1465
-; CHECK-NEXT: incl %r14d
-; CHECK-NEXT: cmpl $16, %r14d
+; CHECK-NEXT: incl %ebx
+; CHECK-NEXT: cmpl $16, %ebx
; CHECK-NEXT: ja LBB0_49
; CHECK-NEXT: ## %bb.22: ## %while.end1465
; CHECK-NEXT: movl $83969, %eax ## imm = 0x14801
-; CHECK-NEXT: btl %r14d, %eax
+; CHECK-NEXT: btl %ebx, %eax
; CHECK-NEXT: jae LBB0_49
; CHECK-NEXT: ## %bb.23:
-; CHECK-NEXT: xorl %ebp, %ebp
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
; CHECK-NEXT: LBB0_47: ## %if.then1477
; CHECK-NEXT: movl $1, %edx
; CHECK-NEXT: callq _write
-; CHECK-NEXT: subq %rbp, %rbx
+; CHECK-NEXT: subq %rbx, %r14
; CHECK-NEXT: movq _syHistory@GOTPCREL(%rip), %rax
-; CHECK-NEXT: leaq 8189(%rbx,%rax), %rax
+; CHECK-NEXT: leaq 8189(%r14,%rax), %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_48: ## %for.body1723
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: decq %rax
; CHECK-NEXT: jmp LBB0_48
; CHECK-NEXT: LBB0_46: ## %if.then1477.loopexit
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
-; CHECK-NEXT: movq %rbx, %rbp
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; CHECK-NEXT: movq %r14, %rbx
; CHECK-NEXT: jmp LBB0_47
; CHECK-NEXT: LBB0_16: ## %while.cond635.preheader
; CHECK-NEXT: xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/reverse_branches.ll b/llvm/test/CodeGen/X86/reverse_branches.ll
index 7d972b476cebb..d874a47356e3c 100644
--- a/llvm/test/CodeGen/X86/reverse_branches.ll
+++ b/llvm/test/CodeGen/X86/reverse_branches.ll
@@ -33,24 +33,24 @@ define i32 @test_branches_order() uwtable ssp {
; CHECK-NEXT: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
; CHECK-NEXT: movq (%rax), %rax
; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: xorl %r12d, %r12d
+; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %r14
; CHECK-NEXT: movq %rsp, %r15
; CHECK-NEXT: jmp LBB0_1
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_6: ## %for.inc9
; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: incl %r12d
+; CHECK-NEXT: incl %ebx
; CHECK-NEXT: LBB0_1: ## %for.cond
; CHECK-NEXT: ## =>This Loop Header: Depth=1
; CHECK-NEXT: ## Child Loop BB0_3 Depth 2
-; CHECK-NEXT: cmpl $999, %r12d ## imm = 0x3E7
+; CHECK-NEXT: cmpl $999, %ebx ## imm = 0x3E7
; CHECK-NEXT: jg LBB0_7
; CHECK-NEXT: ## %bb.2: ## %for.cond1.preheader
; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: movl $-1, %ebp
; CHECK-NEXT: movq %r15, %rdi
-; CHECK-NEXT: movq %r14, %rbx
+; CHECK-NEXT: movq %r14, %r12
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_3: ## %for.cond1
; CHECK-NEXT: ## Parent Loop BB0_1 Depth=1
@@ -60,12 +60,12 @@ define i32 @test_branches_order() uwtable ssp {
; CHECK-NEXT: jg LBB0_6
; CHECK-NEXT: ## %bb.4: ## %for.body3
; CHECK-NEXT: ## in Loop: Header=BB0_3 Depth=2
-; CHECK-NEXT: addq $1002, %rbx ## imm = 0x3EA
+; CHECK-NEXT: addq $1002, %r12 ## imm = 0x3EA
; CHECK-NEXT: leaq 1001(%rdi), %r13
; CHECK-NEXT: movl $1000, %edx ## imm = 0x3E8
; CHECK-NEXT: movl $120, %esi
; CHECK-NEXT: callq _memchr
-; CHECK-NEXT: cmpq %rax, %rbx
+; CHECK-NEXT: cmpq %rax, %r12
; CHECK-NEXT: movq %r13, %rdi
; CHECK-NEXT: je LBB0_3
; CHECK-NEXT: jmp LBB0_5
@@ -94,11 +94,11 @@ define i32 @test_branches_order() uwtable ssp {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_14: ## %exit
; CHECK-NEXT: ## in Loop: Header=BB0_10 Depth=2
-; CHECK-NEXT: addq %rsi, %rbp
+; CHECK-NEXT: addq %rsi, %r8
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: decq %rsi
; CHECK-NEXT: addq $1001, %rdx ## imm = 0x3E9
-; CHECK-NEXT: cmpq $-1000, %rbp ## imm = 0xFC18
+; CHECK-NEXT: cmpq $-1000, %r8 ## imm = 0xFC18
; CHECK-NEXT: jne LBB0_5
; CHECK-NEXT: LBB0_10: ## %for.cond18
; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1
@@ -108,17 +108,17 @@ define i32 @test_branches_order() uwtable ssp {
; CHECK-NEXT: jg LBB0_15
; CHECK-NEXT: ## %bb.11: ## %for.body20
; CHECK-NEXT: ## in Loop: Header=BB0_10 Depth=2
-; CHECK-NEXT: movq $-1000, %rbp ## imm = 0xFC18
+; CHECK-NEXT: movq $-1000, %r8 ## imm = 0xFC18
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_12: ## %do.body.i
; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1
; CHECK-NEXT: ## Parent Loop BB0_10 Depth=2
; CHECK-NEXT: ## => This Inner Loop Header: Depth=3
-; CHECK-NEXT: cmpb $120, 1000(%rdx,%rbp)
+; CHECK-NEXT: cmpb $120, 1000(%rdx,%r8)
; CHECK-NEXT: je LBB0_14
; CHECK-NEXT: ## %bb.13: ## %do.cond.i
; CHECK-NEXT: ## in Loop: Header=BB0_12 Depth=3
-; CHECK-NEXT: incq %rbp
+; CHECK-NEXT: incq %r8
; CHECK-NEXT: jne LBB0_12
; CHECK-NEXT: LBB0_5: ## %if.then
; CHECK-NEXT: leaq L_str4(%rip), %rdi
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 36d23e7c96d25..1038665ccadac 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -355,7 +355,7 @@ define dso_local i32 @sad_avx64i8() nounwind {
;
; AVX1-LABEL: sad_avx64i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -382,15 +382,15 @@ define dso_local i32 @sad_avx64i8() nounwind {
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm7
-; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm7
+; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm8
+; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm8
+; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index 11d86dd72c561..4abb2307a4a65 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -1007,46 +1007,46 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
;
; SSE41-LABEL: v16i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm3, %xmm8
-; SSE41-NEXT: movdqa %xmm2, %xmm10
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: paddd %xmm4, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm11
+; SSE41-NEXT: movdqa %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm1, %xmm9
+; SSE41-NEXT: movdqa %xmm0, %xmm10
+; SSE41-NEXT: paddd %xmm4, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm1
+; SSE41-NEXT: movdqa %xmm10, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm10
+; SSE41-NEXT: movdqa %xmm9, %xmm1
; SSE41-NEXT: paddd %xmm5, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm9
+; SSE41-NEXT: pxor %xmm5, %xmm9
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm8, %xmm2
; SSE41-NEXT: paddd %xmm6, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm10
-; SSE41-NEXT: pxor %xmm6, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm8
+; SSE41-NEXT: pxor %xmm6, %xmm8
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm11, %xmm3
; SSE41-NEXT: paddd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm8
-; SSE41-NEXT: pxor %xmm7, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm11
+; SSE41-NEXT: pxor %xmm7, %xmm11
; SSE41-NEXT: movdqa %xmm3, %xmm5
; SSE41-NEXT: psrad $31, %xmm5
; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm11, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3
-; SSE41-NEXT: movaps %xmm9, %xmm0
+; SSE41-NEXT: movaps %xmm10, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v16i32:
@@ -1267,8 +1267,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pxor %xmm5, %xmm0
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
@@ -1277,20 +1277,20 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm5, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
@@ -1320,8 +1320,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSSE3-NEXT: pandn %xmm0, %xmm6
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pxor %xmm5, %xmm0
; SSSE3-NEXT: pand %xmm7, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm6
@@ -1330,20 +1330,20 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSSE3-NEXT: pxor %xmm1, %xmm4
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm1, %xmm3
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
@@ -1363,11 +1363,11 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: pand %xmm7, %xmm5
; SSE41-NEXT: por %xmm0, %xmm5
; SSE41-NEXT: pxor %xmm2, %xmm5
-; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movapd %xmm7, %xmm2
+; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movapd %xmm8, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -1382,9 +1382,9 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: retq
;
@@ -1463,88 +1463,88 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
; SSE2-NEXT: por %xmm9, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
+; SSE2-NEXT: pxor %xmm10, %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm10
; SSE2-NEXT: pandn %xmm0, %xmm10
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm11, %xmm0
; SSE2-NEXT: por %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm10
; SSE2-NEXT: pxor %xmm8, %xmm10
; SSE2-NEXT: paddq %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: pxor %xmm8, %xmm11
; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE2-NEXT: pand %xmm13, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
+; SSE2-NEXT: pxor %xmm11, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm5
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: paddq %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm10, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: paddq %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSE2-NEXT: pand %xmm12, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm10, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm5
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm9, %xmm2
; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
; SSE2-NEXT: paddq %xmm7, %xmm3
; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm11
-; SSE2-NEXT: pxor %xmm5, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pxor %xmm9, %xmm3
-; SSE2-NEXT: pand %xmm11, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i64:
@@ -1564,88 +1564,88 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
; SSSE3-NEXT: por %xmm9, %xmm10
; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm11, %xmm11
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm10
+; SSSE3-NEXT: pxor %xmm11, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11
+; SSSE3-NEXT: pxor %xmm10, %xmm11
+; SSSE3-NEXT: movdqa %xmm11, %xmm10
; SSSE3-NEXT: pandn %xmm0, %xmm10
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
; SSSE3-NEXT: pxor %xmm9, %xmm0
-; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm11, %xmm0
; SSSE3-NEXT: por %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm10
; SSSE3-NEXT: pxor %xmm8, %xmm10
; SSSE3-NEXT: paddq %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm11
+; SSSE3-NEXT: pxor %xmm8, %xmm11
; SSSE3-NEXT: movdqa %xmm10, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm10
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pandn %xmm1, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSSE3-NEXT: pand %xmm13, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm10, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
+; SSSE3-NEXT: pxor %xmm11, %xmm10
+; SSSE3-NEXT: movdqa %xmm10, %xmm5
+; SSSE3-NEXT: pandn %xmm1, %xmm5
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm10
-; SSSE3-NEXT: paddq %xmm6, %xmm2
+; SSSE3-NEXT: pand %xmm10, %xmm1
+; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: movdqa %xmm10, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm10
+; SSSE3-NEXT: pxor %xmm8, %xmm10
+; SSSE3-NEXT: movdqa %xmm5, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSSE3-NEXT: pand %xmm12, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm10
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSSE3-NEXT: pxor %xmm6, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: pandn %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm10, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pandn %xmm2, %xmm5
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pxor %xmm9, %xmm2
; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: por %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm5
; SSSE3-NEXT: paddq %xmm7, %xmm3
; SSSE3-NEXT: pxor %xmm3, %xmm8
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pand %xmm10, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: pxor %xmm6, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11
-; SSSE3-NEXT: pxor %xmm5, %xmm11
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pandn %xmm3, %xmm4
+; SSSE3-NEXT: pandn %xmm3, %xmm5
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pxor %xmm9, %xmm3
-; SSSE3-NEXT: pand %xmm11, %xmm3
-; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm5, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i64:
@@ -1792,66 +1792,62 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
; SSE-LABEL: v2i128:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx
; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT: seto %r10b
-; SSE-NEXT: movq %r8, %rbx
-; SSE-NEXT: sarq $63, %rbx
-; SSE-NEXT: testb %r10b, %r10b
-; SSE-NEXT: cmovneq %rbx, %rcx
+; SSE-NEXT: seto %dil
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: sarq $63, %r10
+; SSE-NEXT: testb %dil, %dil
+; SSE-NEXT: cmovneq %r10, %rcx
; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
-; SSE-NEXT: xorq %r11, %rbx
-; SSE-NEXT: testb %r10b, %r10b
-; SSE-NEXT: cmoveq %r8, %rbx
+; SSE-NEXT: xorq %r11, %r10
+; SSE-NEXT: testb %dil, %dil
+; SSE-NEXT: cmoveq %r8, %r10
; SSE-NEXT: addq %r9, %rsi
; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT: seto %r8b
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: sarq $63, %rdi
-; SSE-NEXT: testb %r8b, %r8b
-; SSE-NEXT: cmovneq %rdi, %rsi
-; SSE-NEXT: xorq %r11, %rdi
-; SSE-NEXT: testb %r8b, %r8b
-; SSE-NEXT: cmoveq %rdx, %rdi
+; SSE-NEXT: seto %dil
+; SSE-NEXT: movq %rdx, %r8
+; SSE-NEXT: sarq $63, %r8
+; SSE-NEXT: testb %dil, %dil
+; SSE-NEXT: cmovneq %r8, %rsi
+; SSE-NEXT: xorq %r11, %r8
+; SSE-NEXT: testb %dil, %dil
+; SSE-NEXT: cmoveq %rdx, %r8
; SSE-NEXT: movq %rcx, 16(%rax)
; SSE-NEXT: movq %rsi, (%rax)
-; SSE-NEXT: movq %rbx, 24(%rax)
-; SSE-NEXT: movq %rdi, 8(%rax)
-; SSE-NEXT: popq %rbx
+; SSE-NEXT: movq %r10, 24(%rax)
+; SSE-NEXT: movq %r8, 8(%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: v2i128:
; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8
-; AVX-NEXT: seto %r10b
-; AVX-NEXT: movq %r8, %rbx
-; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: testb %r10b, %r10b
-; AVX-NEXT: cmovneq %rbx, %rcx
+; AVX-NEXT: seto %dil
+; AVX-NEXT: movq %r8, %r10
+; AVX-NEXT: sarq $63, %r10
+; AVX-NEXT: testb %dil, %dil
+; AVX-NEXT: cmovneq %r10, %rcx
; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
-; AVX-NEXT: xorq %r11, %rbx
-; AVX-NEXT: testb %r10b, %r10b
-; AVX-NEXT: cmoveq %r8, %rbx
+; AVX-NEXT: xorq %r11, %r10
+; AVX-NEXT: testb %dil, %dil
+; AVX-NEXT: cmoveq %r8, %r10
; AVX-NEXT: addq %r9, %rsi
; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
-; AVX-NEXT: seto %r8b
-; AVX-NEXT: movq %rdx, %rdi
-; AVX-NEXT: sarq $63, %rdi
-; AVX-NEXT: testb %r8b, %r8b
-; AVX-NEXT: cmovneq %rdi, %rsi
-; AVX-NEXT: xorq %r11, %rdi
-; AVX-NEXT: testb %r8b, %r8b
-; AVX-NEXT: cmoveq %rdx, %rdi
+; AVX-NEXT: seto %dil
+; AVX-NEXT: movq %rdx, %r8
+; AVX-NEXT: sarq $63, %r8
+; AVX-NEXT: testb %dil, %dil
+; AVX-NEXT: cmovneq %r8, %rsi
+; AVX-NEXT: xorq %r11, %r8
+; AVX-NEXT: testb %dil, %dil
+; AVX-NEXT: cmoveq %rdx, %r8
; AVX-NEXT: movq %rcx, 16(%rax)
; AVX-NEXT: movq %rsi, (%rax)
-; AVX-NEXT: movq %rbx, 24(%rax)
-; AVX-NEXT: movq %rdi, 8(%rax)
-; AVX-NEXT: popq %rbx
+; AVX-NEXT: movq %r10, 24(%rax)
+; AVX-NEXT: movq %r8, 8(%rax)
; AVX-NEXT: retq
%z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/X86/sbb-false-dep.ll b/llvm/test/CodeGen/X86/sbb-false-dep.ll
index 77e5cb67e5e12..34a92cb58692b 100644
--- a/llvm/test/CodeGen/X86/sbb-false-dep.ll
+++ b/llvm/test/CodeGen/X86/sbb-false-dep.ll
@@ -10,11 +10,9 @@ define i32 @mallocbench_gs(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 n
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movl %r8d, %r13d
+; CHECK-NEXT: movl %r8d, %ebp
; CHECK-NEXT: movl %ecx, %r14d
; CHECK-NEXT: movl %edx, %r15d
; CHECK-NEXT: movq %rsi, %rbx
@@ -24,16 +22,16 @@ define i32 @mallocbench_gs(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 n
; CHECK-NEXT: movq %rbx, %rdx
; CHECK-NEXT: callq foo1@PLT
; CHECK-NEXT: movq 8(%rbx), %rax
-; CHECK-NEXT: movq (%rax), %rdx
-; CHECK-NEXT: xorl %ebp, %ebp
-; CHECK-NEXT: movl %r13d, %ecx
+; CHECK-NEXT: movq (%rax), %rax
+; CHECK-NEXT: xorl %r10d, %r10d
+; CHECK-NEXT: movl %ebp, %ecx
; CHECK-NEXT: negl %ecx
-; CHECK-NEXT: movl $0, %eax
-; CHECK-NEXT: sbbq %rax, %rax
-; CHECK-NEXT: orq %rdx, %rax
-; CHECK-NEXT: cmpl $1, %r13d
-; CHECK-NEXT: sbbq %rbp, %rbp
-; CHECK-NEXT: orq %rdx, %rbp
+; CHECK-NEXT: movl $0, %r11d
+; CHECK-NEXT: sbbq %r11, %r11
+; CHECK-NEXT: orq %rax, %r11
+; CHECK-NEXT: cmpl $1, %ebp
+; CHECK-NEXT: sbbq %r10, %r10
+; CHECK-NEXT: orq %rax, %r10
; CHECK-NEXT: subq $8, %rsp
; CHECK-NEXT: movq %r12, %rdi
; CHECK-NEXT: movl %r15d, %esi
@@ -41,14 +39,13 @@ define i32 @mallocbench_gs(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 n
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: xorl %r8d, %r8d
; CHECK-NEXT: xorl %r9d, %r9d
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pushq %r10
+; CHECK-NEXT: pushq %r11
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: callq foo2@PLT
-; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: addq $32, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
-; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
@@ -59,11 +56,9 @@ define i32 @mallocbench_gs(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 n
; IDIOM-NEXT: pushq %rbp
; IDIOM-NEXT: pushq %r15
; IDIOM-NEXT: pushq %r14
-; IDIOM-NEXT: pushq %r13
; IDIOM-NEXT: pushq %r12
; IDIOM-NEXT: pushq %rbx
-; IDIOM-NEXT: pushq %rax
-; IDIOM-NEXT: movl %r8d, %r13d
+; IDIOM-NEXT: movl %r8d, %ebp
; IDIOM-NEXT: movl %ecx, %r14d
; IDIOM-NEXT: movl %edx, %r15d
; IDIOM-NEXT: movq %rsi, %rbx
@@ -73,14 +68,14 @@ define i32 @mallocbench_gs(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 n
; IDIOM-NEXT: movq %rbx, %rdx
; IDIOM-NEXT: callq foo1@PLT
; IDIOM-NEXT: movq 8(%rbx), %rax
-; IDIOM-NEXT: movq (%rax), %rdx
-; IDIOM-NEXT: movl %r13d, %ecx
+; IDIOM-NEXT: movq (%rax), %rax
+; IDIOM-NEXT: movl %ebp, %ecx
; IDIOM-NEXT: negl %ecx
-; IDIOM-NEXT: sbbq %rbp, %rbp
-; IDIOM-NEXT: orq %rdx, %rbp
-; IDIOM-NEXT: cmpl $1, %r13d
-; IDIOM-NEXT: sbbq %rax, %rax
-; IDIOM-NEXT: orq %rdx, %rax
+; IDIOM-NEXT: sbbq %r10, %r10
+; IDIOM-NEXT: orq %rax, %r10
+; IDIOM-NEXT: cmpl $1, %ebp
+; IDIOM-NEXT: sbbq %r11, %r11
+; IDIOM-NEXT: orq %rax, %r11
; IDIOM-NEXT: subq $8, %rsp
; IDIOM-NEXT: movq %r12, %rdi
; IDIOM-NEXT: movl %r15d, %esi
@@ -88,14 +83,13 @@ define i32 @mallocbench_gs(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 n
; IDIOM-NEXT: xorl %ecx, %ecx
; IDIOM-NEXT: xorl %r8d, %r8d
; IDIOM-NEXT: xorl %r9d, %r9d
-; IDIOM-NEXT: pushq %rax
-; IDIOM-NEXT: pushq %rbp
+; IDIOM-NEXT: pushq %r11
+; IDIOM-NEXT: pushq %r10
; IDIOM-NEXT: pushq %rbx
; IDIOM-NEXT: callq foo2@PLT
-; IDIOM-NEXT: addq $40, %rsp
+; IDIOM-NEXT: addq $32, %rsp
; IDIOM-NEXT: popq %rbx
; IDIOM-NEXT: popq %r12
-; IDIOM-NEXT: popq %r13
; IDIOM-NEXT: popq %r14
; IDIOM-NEXT: popq %r15
; IDIOM-NEXT: popq %rbp
diff --git a/llvm/test/CodeGen/X86/scalar_widen_div.ll b/llvm/test/CodeGen/X86/scalar_widen_div.ll
index 056056e388c3d..23e3d0c5c4321 100644
--- a/llvm/test/CodeGen/X86/scalar_widen_div.ll
+++ b/llvm/test/CodeGen/X86/scalar_widen_div.ll
@@ -7,27 +7,27 @@
define void @vectorDiv (ptr addrspace(1) %nsource, ptr addrspace(1) %dsource, ptr addrspace(1) %qdest) nounwind {
; CHECK-LABEL: vectorDiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rdx, %r8
+; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT: movq (%rdi,%rcx,8), %rdi
-; CHECK-NEXT: movq (%rsi,%rcx,8), %r10
+; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT: movq (%rdi,%r8,8), %rdi
+; CHECK-NEXT: movq (%rsi,%r8,8), %r9
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movq %r10, %rsi
+; CHECK-NEXT: movq %r9, %rsi
; CHECK-NEXT: shrq $32, %rsi
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %esi
-; CHECK-NEXT: movl %eax, %r9d
+; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl %r10d
+; CHECK-NEXT: idivl %r9d
; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: pinsrd $1, %r9d, %xmm0
-; CHECK-NEXT: movq %xmm0, (%r8,%rcx,8)
+; CHECK-NEXT: pinsrd $1, %esi, %xmm0
+; CHECK-NEXT: movq %xmm0, (%rcx,%r8,8)
; CHECK-NEXT: retq
entry:
%nsource.addr = alloca ptr addrspace(1), align 4
@@ -58,15 +58,16 @@ define <3 x i8> @test_char_div(<3 x i8> %num, <3 x i8> %div) {
; CHECK: # %bb.0:
; CHECK-NEXT: movsbl %dil, %eax
; CHECK-NEXT: idivb %cl
-; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movsbl %sil, %eax
; CHECK-NEXT: idivb %r8b
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: movsbl %dl, %eax
; CHECK-NEXT: idivb %r9b
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: retq
%div.r = sdiv <3 x i8> %num, %div
ret <3 x i8> %div.r
@@ -77,15 +78,16 @@ define <3 x i8> @test_uchar_div(<3 x i8> %num, <3 x i8> %div) {
; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: divb %cl
-; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movzbl %sil, %eax
; CHECK-NEXT: divb %r8b
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: movzbl %dl, %eax
; CHECK-NEXT: divb %r9b
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: retq
%div.r = udiv <3 x i8> %num, %div
ret <3 x i8> %div.r
@@ -99,36 +101,36 @@ define <5 x i16> @test_short_div(<5 x i16> %num, <5 x i16> %div) {
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
; CHECK-NEXT: idivw %cx
-; CHECK-NEXT: movl %eax, %r8d
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: pextrw $3, %xmm0, %eax
-; CHECK-NEXT: pextrw $3, %xmm1, %ecx
+; CHECK-NEXT: pextrw $3, %xmm1, %esi
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
-; CHECK-NEXT: idivw %cx
-; CHECK-NEXT: movl %eax, %r9d
+; CHECK-NEXT: idivw %si
+; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: pextrw $2, %xmm0, %eax
-; CHECK-NEXT: pextrw $2, %xmm1, %ecx
+; CHECK-NEXT: pextrw $2, %xmm1, %edi
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
-; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: idivw %di
; CHECK-NEXT: movl %eax, %edi
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: movd %xmm1, %r8d
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
-; CHECK-NEXT: idivw %cx
-; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: idivw %r8w
+; CHECK-NEXT: movl %eax, %r8d
; CHECK-NEXT: pextrw $1, %xmm0, %eax
-; CHECK-NEXT: pextrw $1, %xmm1, %esi
+; CHECK-NEXT: pextrw $1, %xmm1, %r9d
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
-; CHECK-NEXT: idivw %si
+; CHECK-NEXT: idivw %r9w
; CHECK-NEXT: # kill: def $ax killed $ax def $eax
-; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movd %r8d, %xmm0
; CHECK-NEXT: pinsrw $1, %eax, %xmm0
; CHECK-NEXT: pinsrw $2, %edi, %xmm0
-; CHECK-NEXT: pinsrw $3, %r9d, %xmm0
-; CHECK-NEXT: pinsrw $4, %r8d, %xmm0
+; CHECK-NEXT: pinsrw $3, %esi, %xmm0
+; CHECK-NEXT: pinsrw $4, %ecx, %xmm0
; CHECK-NEXT: retq
%div.r = sdiv <5 x i16> %num, %div
ret <5 x i16> %div.r
@@ -285,36 +287,36 @@ define <5 x i16> @test_short_rem(<5 x i16> %num, <5 x i16> %rem) {
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
; CHECK-NEXT: idivw %cx
-; CHECK-NEXT: movl %edx, %r8d
+; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: pextrw $3, %xmm0, %eax
-; CHECK-NEXT: pextrw $3, %xmm1, %ecx
+; CHECK-NEXT: pextrw $3, %xmm1, %esi
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
-; CHECK-NEXT: idivw %cx
-; CHECK-NEXT: movl %edx, %r9d
+; CHECK-NEXT: idivw %si
+; CHECK-NEXT: movl %edx, %esi
; CHECK-NEXT: pextrw $2, %xmm0, %eax
-; CHECK-NEXT: pextrw $2, %xmm1, %ecx
+; CHECK-NEXT: pextrw $2, %xmm1, %edi
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
-; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: idivw %di
; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: movd %xmm1, %r8d
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
-; CHECK-NEXT: idivw %cx
-; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: idivw %r8w
+; CHECK-NEXT: movl %edx, %r8d
; CHECK-NEXT: pextrw $1, %xmm0, %eax
-; CHECK-NEXT: pextrw $1, %xmm1, %esi
+; CHECK-NEXT: pextrw $1, %xmm1, %r9d
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: cwtd
-; CHECK-NEXT: idivw %si
+; CHECK-NEXT: idivw %r9w
; CHECK-NEXT: # kill: def $dx killed $dx def $edx
-; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movd %r8d, %xmm0
; CHECK-NEXT: pinsrw $1, %edx, %xmm0
; CHECK-NEXT: pinsrw $2, %edi, %xmm0
-; CHECK-NEXT: pinsrw $3, %r9d, %xmm0
-; CHECK-NEXT: pinsrw $4, %r8d, %xmm0
+; CHECK-NEXT: pinsrw $3, %esi, %xmm0
+; CHECK-NEXT: pinsrw $4, %ecx, %xmm0
; CHECK-NEXT: retq
%rem.r = srem <5 x i16> %num, %rem
ret <5 x i16> %rem.r
@@ -390,28 +392,28 @@ define void @test_int_div(ptr %dest, ptr %old, i32 %n) {
; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jle .LBB12_3
; CHECK-NEXT: # %bb.1: # %bb.nph
-; CHECK-NEXT: movl %edx, %r10d
-; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: xorl %r10d, %r10d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB12_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl (%rdi,%rcx), %r8d
-; CHECK-NEXT: movl 4(%rdi,%rcx), %eax
+; CHECK-NEXT: movl (%rdi,%r10), %r8d
+; CHECK-NEXT: movl 4(%rdi,%r10), %eax
; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl 4(%rsi,%rcx)
+; CHECK-NEXT: idivl 4(%rsi,%r10)
; CHECK-NEXT: movl %eax, %r9d
; CHECK-NEXT: movl %r8d, %eax
; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl (%rsi,%rcx)
+; CHECK-NEXT: idivl (%rsi,%r10)
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: pinsrd $1, %r9d, %xmm0
-; CHECK-NEXT: movl 8(%rdi,%rcx), %eax
+; CHECK-NEXT: movl 8(%rdi,%r10), %eax
; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl 8(%rsi,%rcx)
-; CHECK-NEXT: movl %eax, 8(%rdi,%rcx)
-; CHECK-NEXT: movq %xmm0, (%rdi,%rcx)
-; CHECK-NEXT: addq $16, %rcx
-; CHECK-NEXT: decl %r10d
+; CHECK-NEXT: idivl 8(%rsi,%r10)
+; CHECK-NEXT: movl %eax, 8(%rdi,%r10)
+; CHECK-NEXT: movq %xmm0, (%rdi,%r10)
+; CHECK-NEXT: addq $16, %r10
+; CHECK-NEXT: decl %ecx
; CHECK-NEXT: jne .LBB12_2
; CHECK-NEXT: .LBB12_3: # %for.end
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 8c3d25fe565f3..03ea229dc4dda 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -17,41 +17,41 @@ define i256 @test1(i256 %a) nounwind {
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: xorl %r8d, %r8d
; ILP-NEXT: addl %esi, %esi
-; ILP-NEXT: leal 3(%rsi), %r9d
+; ILP-NEXT: leal 3(%rsi), %edx
+; ILP-NEXT: movl $1, %r9d
+; ILP-NEXT: xorl %r10d, %r10d
+; ILP-NEXT: movl %edx, %ecx
+; ILP-NEXT: shldq %cl, %r9, %r10
; ILP-NEXT: movl $1, %r11d
-; ILP-NEXT: xorl %r14d, %r14d
-; ILP-NEXT: movl %r9d, %ecx
-; ILP-NEXT: shldq %cl, %r11, %r14
-; ILP-NEXT: movl $1, %edx
-; ILP-NEXT: shlq %cl, %rdx
-; ILP-NEXT: leal -125(%rsi), %r10d
+; ILP-NEXT: shlq %cl, %r11
+; ILP-NEXT: leal -125(%rsi), %edi
; ILP-NEXT: xorl %ebx, %ebx
-; ILP-NEXT: movl %r10d, %ecx
-; ILP-NEXT: shldq %cl, %r11, %rbx
-; ILP-NEXT: testb $64, %r9b
-; ILP-NEXT: cmovneq %rdx, %r14
-; ILP-NEXT: cmovneq %r8, %rdx
-; ILP-NEXT: movl $1, %edi
-; ILP-NEXT: shlq %cl, %rdi
+; ILP-NEXT: movl %edi, %ecx
+; ILP-NEXT: shldq %cl, %r9, %rbx
+; ILP-NEXT: testb $64, %dl
+; ILP-NEXT: cmovneq %r11, %r10
+; ILP-NEXT: cmovneq %r8, %r11
+; ILP-NEXT: movl $1, %r14d
+; ILP-NEXT: shlq %cl, %r14
; ILP-NEXT: movb $125, %cl
; ILP-NEXT: subb %sil, %cl
-; ILP-NEXT: shrdq %cl, %r8, %r11
+; ILP-NEXT: shrdq %cl, %r8, %r9
; ILP-NEXT: testb $64, %cl
-; ILP-NEXT: cmovneq %r8, %r11
-; ILP-NEXT: testb $64, %r10b
-; ILP-NEXT: cmovneq %rdi, %rbx
-; ILP-NEXT: cmovneq %r8, %rdi
-; ILP-NEXT: testb %r9b, %r9b
-; ILP-NEXT: cmovsq %r8, %r14
-; ILP-NEXT: cmovsq %r8, %rdx
-; ILP-NEXT: movq %r14, 8(%rax)
-; ILP-NEXT: movq %rdx, (%rax)
+; ILP-NEXT: cmovneq %r8, %r9
+; ILP-NEXT: testb $64, %dil
+; ILP-NEXT: cmovneq %r14, %rbx
+; ILP-NEXT: cmovneq %r8, %r14
+; ILP-NEXT: testb %dl, %dl
+; ILP-NEXT: cmovsq %r8, %r10
+; ILP-NEXT: cmovsq %r8, %r11
+; ILP-NEXT: movq %r10, 8(%rax)
+; ILP-NEXT: movq %r11, (%rax)
; ILP-NEXT: cmovnsq %r8, %rbx
; ILP-NEXT: cmoveq %r8, %rbx
; ILP-NEXT: movq %rbx, 24(%rax)
-; ILP-NEXT: cmovnsq %r11, %rdi
-; ILP-NEXT: cmoveq %r8, %rdi
-; ILP-NEXT: movq %rdi, 16(%rax)
+; ILP-NEXT: cmovnsq %r9, %r14
+; ILP-NEXT: cmoveq %r8, %r14
+; ILP-NEXT: movq %r14, 16(%rax)
; ILP-NEXT: popq %rbx
; ILP-NEXT: popq %r14
; ILP-NEXT: retq
@@ -63,41 +63,41 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-NEXT: addl %esi, %esi
; HYBRID-NEXT: movb $125, %cl
; HYBRID-NEXT: subb %sil, %cl
-; HYBRID-NEXT: xorl %r8d, %r8d
-; HYBRID-NEXT: movl $1, %edi
+; HYBRID-NEXT: xorl %edi, %edi
; HYBRID-NEXT: movl $1, %r9d
-; HYBRID-NEXT: shrdq %cl, %r8, %r9
+; HYBRID-NEXT: movl $1, %r8d
+; HYBRID-NEXT: shrdq %cl, %rdi, %r8
; HYBRID-NEXT: testb $64, %cl
-; HYBRID-NEXT: cmovneq %r8, %r9
-; HYBRID-NEXT: leal 3(%rsi), %r10d
+; HYBRID-NEXT: cmovneq %rdi, %r8
+; HYBRID-NEXT: leal 3(%rsi), %edx
; HYBRID-NEXT: xorl %r11d, %r11d
-; HYBRID-NEXT: movl %r10d, %ecx
-; HYBRID-NEXT: shldq %cl, %rdi, %r11
+; HYBRID-NEXT: movl %edx, %ecx
+; HYBRID-NEXT: shldq %cl, %r9, %r11
; HYBRID-NEXT: addb $-125, %sil
; HYBRID-NEXT: xorl %ebx, %ebx
; HYBRID-NEXT: movl %esi, %ecx
-; HYBRID-NEXT: shldq %cl, %rdi, %rbx
-; HYBRID-NEXT: movl $1, %edx
-; HYBRID-NEXT: shlq %cl, %rdx
+; HYBRID-NEXT: shldq %cl, %r9, %rbx
+; HYBRID-NEXT: movl $1, %r10d
+; HYBRID-NEXT: shlq %cl, %r10
; HYBRID-NEXT: testb $64, %sil
-; HYBRID-NEXT: cmovneq %rdx, %rbx
-; HYBRID-NEXT: cmovneq %r8, %rdx
-; HYBRID-NEXT: movl %r10d, %ecx
-; HYBRID-NEXT: shlq %cl, %rdi
-; HYBRID-NEXT: testb $64, %r10b
-; HYBRID-NEXT: cmovneq %rdi, %r11
-; HYBRID-NEXT: cmovneq %r8, %rdi
-; HYBRID-NEXT: testb %r10b, %r10b
-; HYBRID-NEXT: cmovsq %r8, %r11
+; HYBRID-NEXT: cmovneq %r10, %rbx
+; HYBRID-NEXT: cmovneq %rdi, %r10
+; HYBRID-NEXT: movl %edx, %ecx
+; HYBRID-NEXT: shlq %cl, %r9
+; HYBRID-NEXT: testb $64, %dl
+; HYBRID-NEXT: cmovneq %r9, %r11
+; HYBRID-NEXT: cmovneq %rdi, %r9
+; HYBRID-NEXT: testb %dl, %dl
+; HYBRID-NEXT: cmovsq %rdi, %r11
; HYBRID-NEXT: movq %r11, 8(%rax)
-; HYBRID-NEXT: cmovsq %r8, %rdi
-; HYBRID-NEXT: movq %rdi, (%rax)
-; HYBRID-NEXT: cmovnsq %r8, %rbx
-; HYBRID-NEXT: cmoveq %r8, %rbx
+; HYBRID-NEXT: cmovsq %rdi, %r9
+; HYBRID-NEXT: movq %r9, (%rax)
+; HYBRID-NEXT: cmovnsq %rdi, %rbx
+; HYBRID-NEXT: cmoveq %rdi, %rbx
; HYBRID-NEXT: movq %rbx, 24(%rax)
-; HYBRID-NEXT: cmovnsq %r9, %rdx
-; HYBRID-NEXT: cmoveq %r8, %rdx
-; HYBRID-NEXT: movq %rdx, 16(%rax)
+; HYBRID-NEXT: cmovnsq %r8, %r10
+; HYBRID-NEXT: cmoveq %rdi, %r10
+; HYBRID-NEXT: movq %r10, 16(%rax)
; HYBRID-NEXT: popq %rbx
; HYBRID-NEXT: retq
;
@@ -108,41 +108,41 @@ define i256 @test1(i256 %a) nounwind {
; BURR-NEXT: addl %esi, %esi
; BURR-NEXT: movb $125, %cl
; BURR-NEXT: subb %sil, %cl
-; BURR-NEXT: xorl %r8d, %r8d
-; BURR-NEXT: movl $1, %edi
+; BURR-NEXT: xorl %edi, %edi
; BURR-NEXT: movl $1, %r9d
-; BURR-NEXT: shrdq %cl, %r8, %r9
+; BURR-NEXT: movl $1, %r8d
+; BURR-NEXT: shrdq %cl, %rdi, %r8
; BURR-NEXT: testb $64, %cl
-; BURR-NEXT: cmovneq %r8, %r9
-; BURR-NEXT: leal 3(%rsi), %r10d
+; BURR-NEXT: cmovneq %rdi, %r8
+; BURR-NEXT: leal 3(%rsi), %edx
; BURR-NEXT: xorl %r11d, %r11d
-; BURR-NEXT: movl %r10d, %ecx
-; BURR-NEXT: shldq %cl, %rdi, %r11
+; BURR-NEXT: movl %edx, %ecx
+; BURR-NEXT: shldq %cl, %r9, %r11
; BURR-NEXT: addb $-125, %sil
; BURR-NEXT: xorl %ebx, %ebx
; BURR-NEXT: movl %esi, %ecx
-; BURR-NEXT: shldq %cl, %rdi, %rbx
-; BURR-NEXT: movl $1, %edx
-; BURR-NEXT: shlq %cl, %rdx
+; BURR-NEXT: shldq %cl, %r9, %rbx
+; BURR-NEXT: movl $1, %r10d
+; BURR-NEXT: shlq %cl, %r10
; BURR-NEXT: testb $64, %sil
-; BURR-NEXT: cmovneq %rdx, %rbx
-; BURR-NEXT: cmovneq %r8, %rdx
-; BURR-NEXT: movl %r10d, %ecx
-; BURR-NEXT: shlq %cl, %rdi
-; BURR-NEXT: testb $64, %r10b
-; BURR-NEXT: cmovneq %rdi, %r11
-; BURR-NEXT: cmovneq %r8, %rdi
-; BURR-NEXT: testb %r10b, %r10b
-; BURR-NEXT: cmovsq %r8, %r11
+; BURR-NEXT: cmovneq %r10, %rbx
+; BURR-NEXT: cmovneq %rdi, %r10
+; BURR-NEXT: movl %edx, %ecx
+; BURR-NEXT: shlq %cl, %r9
+; BURR-NEXT: testb $64, %dl
+; BURR-NEXT: cmovneq %r9, %r11
+; BURR-NEXT: cmovneq %rdi, %r9
+; BURR-NEXT: testb %dl, %dl
+; BURR-NEXT: cmovsq %rdi, %r11
; BURR-NEXT: movq %r11, 8(%rax)
-; BURR-NEXT: cmovsq %r8, %rdi
-; BURR-NEXT: movq %rdi, (%rax)
-; BURR-NEXT: cmovnsq %r8, %rbx
-; BURR-NEXT: cmoveq %r8, %rbx
+; BURR-NEXT: cmovsq %rdi, %r9
+; BURR-NEXT: movq %r9, (%rax)
+; BURR-NEXT: cmovnsq %rdi, %rbx
+; BURR-NEXT: cmoveq %rdi, %rbx
; BURR-NEXT: movq %rbx, 24(%rax)
-; BURR-NEXT: cmovnsq %r9, %rdx
-; BURR-NEXT: cmoveq %r8, %rdx
-; BURR-NEXT: movq %rdx, 16(%rax)
+; BURR-NEXT: cmovnsq %r8, %r10
+; BURR-NEXT: cmoveq %rdi, %r10
+; BURR-NEXT: movq %r10, 16(%rax)
; BURR-NEXT: popq %rbx
; BURR-NEXT: retq
;
@@ -151,7 +151,7 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: pushq %rbx
; SRC-NEXT: movq %rdi, %rax
; SRC-NEXT: addl %esi, %esi
-; SRC-NEXT: leal 3(%rsi), %r9d
+; SRC-NEXT: leal 3(%rsi), %edx
; SRC-NEXT: movb $125, %cl
; SRC-NEXT: subb %sil, %cl
; SRC-NEXT: xorl %r8d, %r8d
@@ -161,32 +161,32 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: testb $64, %cl
; SRC-NEXT: cmovneq %r8, %r10
; SRC-NEXT: addb $-125, %sil
-; SRC-NEXT: xorl %edx, %edx
+; SRC-NEXT: xorl %r9d, %r9d
; SRC-NEXT: movl %esi, %ecx
-; SRC-NEXT: shldq %cl, %rdi, %rdx
+; SRC-NEXT: shldq %cl, %rdi, %r9
; SRC-NEXT: xorl %r11d, %r11d
-; SRC-NEXT: movl %r9d, %ecx
+; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shldq %cl, %rdi, %r11
; SRC-NEXT: movl $1, %ebx
; SRC-NEXT: shlq %cl, %rbx
-; SRC-NEXT: testb $64, %r9b
+; SRC-NEXT: testb $64, %dl
; SRC-NEXT: cmovneq %rbx, %r11
; SRC-NEXT: cmovneq %r8, %rbx
; SRC-NEXT: movl %esi, %ecx
; SRC-NEXT: shlq %cl, %rdi
; SRC-NEXT: testb $64, %sil
-; SRC-NEXT: cmovneq %rdi, %rdx
+; SRC-NEXT: cmovneq %rdi, %r9
; SRC-NEXT: cmovneq %r8, %rdi
-; SRC-NEXT: testb %r9b, %r9b
+; SRC-NEXT: testb %dl, %dl
; SRC-NEXT: cmovnsq %r10, %rdi
; SRC-NEXT: cmoveq %r8, %rdi
-; SRC-NEXT: cmovnsq %r8, %rdx
-; SRC-NEXT: cmoveq %r8, %rdx
+; SRC-NEXT: cmovnsq %r8, %r9
+; SRC-NEXT: cmoveq %r8, %r9
; SRC-NEXT: cmovsq %r8, %r11
; SRC-NEXT: cmovsq %r8, %rbx
; SRC-NEXT: movq %r11, 8(%rax)
; SRC-NEXT: movq %rbx, (%rax)
-; SRC-NEXT: movq %rdx, 24(%rax)
+; SRC-NEXT: movq %r9, 24(%rax)
; SRC-NEXT: movq %rdi, 16(%rax)
; SRC-NEXT: popq %rbx
; SRC-NEXT: retq
@@ -194,46 +194,46 @@ define i256 @test1(i256 %a) nounwind {
; LIN-LABEL: test1:
; LIN: # %bb.0:
; LIN-NEXT: movq %rdi, %rax
-; LIN-NEXT: xorl %r9d, %r9d
+; LIN-NEXT: xorl %edi, %edi
; LIN-NEXT: movl $1, %r8d
; LIN-NEXT: addl %esi, %esi
; LIN-NEXT: leal 3(%rsi), %ecx
-; LIN-NEXT: movl $1, %edi
-; LIN-NEXT: shlq %cl, %rdi
+; LIN-NEXT: movl $1, %edx
+; LIN-NEXT: shlq %cl, %rdx
; LIN-NEXT: testb $64, %cl
-; LIN-NEXT: movq %rdi, %rdx
-; LIN-NEXT: cmovneq %r9, %rdx
+; LIN-NEXT: movq %rdx, %r9
+; LIN-NEXT: cmovneq %rdi, %r9
; LIN-NEXT: testb %cl, %cl
-; LIN-NEXT: cmovsq %r9, %rdx
-; LIN-NEXT: movq %rdx, (%rax)
-; LIN-NEXT: xorl %edx, %edx
+; LIN-NEXT: cmovsq %rdi, %r9
+; LIN-NEXT: movq %r9, (%rax)
+; LIN-NEXT: xorl %r9d, %r9d
; LIN-NEXT: # kill: def $cl killed $cl killed $ecx
-; LIN-NEXT: shldq %cl, %r8, %rdx
-; LIN-NEXT: cmovneq %rdi, %rdx
-; LIN-NEXT: cmovsq %r9, %rdx
-; LIN-NEXT: movq %rdx, 8(%rax)
-; LIN-NEXT: leal -125(%rsi), %r10d
-; LIN-NEXT: movl $1, %edx
-; LIN-NEXT: movl %r10d, %ecx
-; LIN-NEXT: shlq %cl, %rdx
-; LIN-NEXT: testb $64, %r10b
-; LIN-NEXT: movq %rdx, %rdi
-; LIN-NEXT: cmovneq %r9, %rdi
+; LIN-NEXT: shldq %cl, %r8, %r9
+; LIN-NEXT: cmovneq %rdx, %r9
+; LIN-NEXT: cmovsq %rdi, %r9
+; LIN-NEXT: movq %r9, 8(%rax)
+; LIN-NEXT: leal -125(%rsi), %edx
+; LIN-NEXT: movl $1, %r9d
+; LIN-NEXT: movl %edx, %ecx
+; LIN-NEXT: shlq %cl, %r9
+; LIN-NEXT: testb $64, %dl
+; LIN-NEXT: movq %r9, %r10
+; LIN-NEXT: cmovneq %rdi, %r10
; LIN-NEXT: movb $125, %cl
; LIN-NEXT: subb %sil, %cl
; LIN-NEXT: movl $1, %esi
-; LIN-NEXT: shrdq %cl, %r9, %rsi
+; LIN-NEXT: shrdq %cl, %rdi, %rsi
; LIN-NEXT: testb $64, %cl
-; LIN-NEXT: cmovneq %r9, %rsi
-; LIN-NEXT: cmovsq %rdi, %rsi
-; LIN-NEXT: cmoveq %r9, %rsi
+; LIN-NEXT: cmovneq %rdi, %rsi
+; LIN-NEXT: cmovsq %r10, %rsi
+; LIN-NEXT: cmoveq %rdi, %rsi
; LIN-NEXT: movq %rsi, 16(%rax)
; LIN-NEXT: xorl %esi, %esi
-; LIN-NEXT: movl %r10d, %ecx
+; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shldq %cl, %r8, %rsi
-; LIN-NEXT: cmovneq %rdx, %rsi
-; LIN-NEXT: cmovnsq %r9, %rsi
-; LIN-NEXT: cmoveq %r9, %rsi
+; LIN-NEXT: cmovneq %r9, %rsi
+; LIN-NEXT: cmovnsq %rdi, %rsi
+; LIN-NEXT: cmoveq %rdi, %rsi
; LIN-NEXT: movq %rsi, 24(%rax)
; LIN-NEXT: retq
%b = add i256 %a, 1
@@ -250,38 +250,38 @@ define i256 @test2(i256 %a) nounwind {
; ILP-LABEL: test2:
; ILP: # %bb.0:
; ILP-NEXT: movq %rdi, %rax
-; ILP-NEXT: xorl %r9d, %r9d
+; ILP-NEXT: xorl %edi, %edi
; ILP-NEXT: movq %rsi, %r11
; ILP-NEXT: negq %r11
; ILP-NEXT: movl $0, %r10d
; ILP-NEXT: sbbq %rdx, %r10
-; ILP-NEXT: movl $0, %edi
-; ILP-NEXT: sbbq %rcx, %rdi
-; ILP-NEXT: sbbq %r8, %r9
-; ILP-NEXT: andq %r8, %r9
-; ILP-NEXT: bsrq %r9, %r8
+; ILP-NEXT: movl $0, %r9d
+; ILP-NEXT: sbbq %rcx, %r9
+; ILP-NEXT: sbbq %r8, %rdi
+; ILP-NEXT: andq %r8, %rdi
+; ILP-NEXT: bsrq %rdi, %r8
; ILP-NEXT: andq %rdx, %r10
; ILP-NEXT: bsrq %r10, %rdx
; ILP-NEXT: xorq $63, %r8
-; ILP-NEXT: andq %rcx, %rdi
-; ILP-NEXT: bsrq %rdi, %rcx
+; ILP-NEXT: andq %rcx, %r9
+; ILP-NEXT: bsrq %r9, %rcx
; ILP-NEXT: xorq $63, %rcx
; ILP-NEXT: addq $64, %rcx
-; ILP-NEXT: testq %r9, %r9
+; ILP-NEXT: testq %rdi, %rdi
; ILP-NEXT: cmovneq %r8, %rcx
; ILP-NEXT: xorq $63, %rdx
; ILP-NEXT: andq %rsi, %r11
-; ILP-NEXT: movl $127, %r8d
-; ILP-NEXT: bsrq %r11, %rsi
-; ILP-NEXT: cmoveq %r8, %rsi
-; ILP-NEXT: xorq $63, %rsi
-; ILP-NEXT: addq $64, %rsi
+; ILP-NEXT: movl $127, %esi
+; ILP-NEXT: bsrq %r11, %r8
+; ILP-NEXT: cmoveq %rsi, %r8
+; ILP-NEXT: xorq $63, %r8
+; ILP-NEXT: addq $64, %r8
; ILP-NEXT: testq %r10, %r10
-; ILP-NEXT: cmovneq %rdx, %rsi
-; ILP-NEXT: subq $-128, %rsi
-; ILP-NEXT: orq %r9, %rdi
-; ILP-NEXT: cmovneq %rcx, %rsi
-; ILP-NEXT: movq %rsi, (%rax)
+; ILP-NEXT: cmovneq %rdx, %r8
+; ILP-NEXT: subq $-128, %r8
+; ILP-NEXT: orq %rdi, %r9
+; ILP-NEXT: cmovneq %rcx, %r8
+; ILP-NEXT: movq %r8, (%rax)
; ILP-NEXT: movq $0, 24(%rax)
; ILP-NEXT: movq $0, 16(%rax)
; ILP-NEXT: movq $0, 8(%rax)
@@ -290,38 +290,38 @@ define i256 @test2(i256 %a) nounwind {
; HYBRID-LABEL: test2:
; HYBRID: # %bb.0:
; HYBRID-NEXT: movq %rdi, %rax
-; HYBRID-NEXT: xorl %r9d, %r9d
+; HYBRID-NEXT: xorl %edi, %edi
; HYBRID-NEXT: movq %rsi, %r11
; HYBRID-NEXT: negq %r11
; HYBRID-NEXT: movl $0, %r10d
; HYBRID-NEXT: sbbq %rdx, %r10
-; HYBRID-NEXT: movl $0, %edi
-; HYBRID-NEXT: sbbq %rcx, %rdi
-; HYBRID-NEXT: sbbq %r8, %r9
-; HYBRID-NEXT: andq %r8, %r9
-; HYBRID-NEXT: bsrq %r9, %r8
+; HYBRID-NEXT: movl $0, %r9d
+; HYBRID-NEXT: sbbq %rcx, %r9
+; HYBRID-NEXT: sbbq %r8, %rdi
+; HYBRID-NEXT: andq %r8, %rdi
+; HYBRID-NEXT: bsrq %rdi, %r8
; HYBRID-NEXT: xorq $63, %r8
-; HYBRID-NEXT: andq %rcx, %rdi
-; HYBRID-NEXT: bsrq %rdi, %rcx
+; HYBRID-NEXT: andq %rcx, %r9
+; HYBRID-NEXT: bsrq %r9, %rcx
; HYBRID-NEXT: xorq $63, %rcx
; HYBRID-NEXT: addq $64, %rcx
-; HYBRID-NEXT: testq %r9, %r9
+; HYBRID-NEXT: testq %rdi, %rdi
; HYBRID-NEXT: cmovneq %r8, %rcx
; HYBRID-NEXT: andq %rdx, %r10
; HYBRID-NEXT: bsrq %r10, %rdx
; HYBRID-NEXT: xorq $63, %rdx
; HYBRID-NEXT: andq %rsi, %r11
-; HYBRID-NEXT: movl $127, %r8d
-; HYBRID-NEXT: bsrq %r11, %rsi
-; HYBRID-NEXT: cmoveq %r8, %rsi
-; HYBRID-NEXT: xorq $63, %rsi
-; HYBRID-NEXT: addq $64, %rsi
+; HYBRID-NEXT: movl $127, %esi
+; HYBRID-NEXT: bsrq %r11, %r8
+; HYBRID-NEXT: cmoveq %rsi, %r8
+; HYBRID-NEXT: xorq $63, %r8
+; HYBRID-NEXT: addq $64, %r8
; HYBRID-NEXT: testq %r10, %r10
-; HYBRID-NEXT: cmovneq %rdx, %rsi
-; HYBRID-NEXT: subq $-128, %rsi
-; HYBRID-NEXT: orq %r9, %rdi
-; HYBRID-NEXT: cmovneq %rcx, %rsi
-; HYBRID-NEXT: movq %rsi, (%rax)
+; HYBRID-NEXT: cmovneq %rdx, %r8
+; HYBRID-NEXT: subq $-128, %r8
+; HYBRID-NEXT: orq %rdi, %r9
+; HYBRID-NEXT: cmovneq %rcx, %r8
+; HYBRID-NEXT: movq %r8, (%rax)
; HYBRID-NEXT: movq $0, 24(%rax)
; HYBRID-NEXT: movq $0, 16(%rax)
; HYBRID-NEXT: movq $0, 8(%rax)
@@ -330,38 +330,38 @@ define i256 @test2(i256 %a) nounwind {
; BURR-LABEL: test2:
; BURR: # %bb.0:
; BURR-NEXT: movq %rdi, %rax
-; BURR-NEXT: xorl %r9d, %r9d
+; BURR-NEXT: xorl %edi, %edi
; BURR-NEXT: movq %rsi, %r11
; BURR-NEXT: negq %r11
; BURR-NEXT: movl $0, %r10d
; BURR-NEXT: sbbq %rdx, %r10
-; BURR-NEXT: movl $0, %edi
-; BURR-NEXT: sbbq %rcx, %rdi
-; BURR-NEXT: sbbq %r8, %r9
-; BURR-NEXT: andq %r8, %r9
-; BURR-NEXT: bsrq %r9, %r8
+; BURR-NEXT: movl $0, %r9d
+; BURR-NEXT: sbbq %rcx, %r9
+; BURR-NEXT: sbbq %r8, %rdi
+; BURR-NEXT: andq %r8, %rdi
+; BURR-NEXT: bsrq %rdi, %r8
; BURR-NEXT: xorq $63, %r8
-; BURR-NEXT: andq %rcx, %rdi
-; BURR-NEXT: bsrq %rdi, %rcx
+; BURR-NEXT: andq %rcx, %r9
+; BURR-NEXT: bsrq %r9, %rcx
; BURR-NEXT: xorq $63, %rcx
; BURR-NEXT: addq $64, %rcx
-; BURR-NEXT: testq %r9, %r9
+; BURR-NEXT: testq %rdi, %rdi
; BURR-NEXT: cmovneq %r8, %rcx
; BURR-NEXT: andq %rdx, %r10
; BURR-NEXT: bsrq %r10, %rdx
; BURR-NEXT: xorq $63, %rdx
; BURR-NEXT: andq %rsi, %r11
-; BURR-NEXT: movl $127, %r8d
-; BURR-NEXT: bsrq %r11, %rsi
-; BURR-NEXT: cmoveq %r8, %rsi
-; BURR-NEXT: xorq $63, %rsi
-; BURR-NEXT: addq $64, %rsi
+; BURR-NEXT: movl $127, %esi
+; BURR-NEXT: bsrq %r11, %r8
+; BURR-NEXT: cmoveq %rsi, %r8
+; BURR-NEXT: xorq $63, %r8
+; BURR-NEXT: addq $64, %r8
; BURR-NEXT: testq %r10, %r10
-; BURR-NEXT: cmovneq %rdx, %rsi
-; BURR-NEXT: subq $-128, %rsi
-; BURR-NEXT: orq %r9, %rdi
-; BURR-NEXT: cmovneq %rcx, %rsi
-; BURR-NEXT: movq %rsi, (%rax)
+; BURR-NEXT: cmovneq %rdx, %r8
+; BURR-NEXT: subq $-128, %r8
+; BURR-NEXT: orq %rdi, %r9
+; BURR-NEXT: cmovneq %rcx, %r8
+; BURR-NEXT: movq %r8, (%rax)
; BURR-NEXT: movq $0, 24(%rax)
; BURR-NEXT: movq $0, 16(%rax)
; BURR-NEXT: movq $0, 8(%rax)
@@ -391,17 +391,17 @@ define i256 @test2(i256 %a) nounwind {
; SRC-NEXT: cmovneq %rcx, %rdx
; SRC-NEXT: bsrq %r10, %rcx
; SRC-NEXT: xorq $63, %rcx
-; SRC-NEXT: bsrq %r11, %r8
-; SRC-NEXT: movl $127, %esi
-; SRC-NEXT: cmovneq %r8, %rsi
-; SRC-NEXT: xorq $63, %rsi
-; SRC-NEXT: addq $64, %rsi
+; SRC-NEXT: bsrq %r11, %rsi
+; SRC-NEXT: movl $127, %r8d
+; SRC-NEXT: cmovneq %rsi, %r8
+; SRC-NEXT: xorq $63, %r8
+; SRC-NEXT: addq $64, %r8
; SRC-NEXT: testq %r10, %r10
-; SRC-NEXT: cmovneq %rcx, %rsi
-; SRC-NEXT: subq $-128, %rsi
+; SRC-NEXT: cmovneq %rcx, %r8
+; SRC-NEXT: subq $-128, %r8
; SRC-NEXT: orq %r9, %rdi
-; SRC-NEXT: cmovneq %rdx, %rsi
-; SRC-NEXT: movq %rsi, (%rax)
+; SRC-NEXT: cmovneq %rdx, %r8
+; SRC-NEXT: movq %r8, (%rax)
; SRC-NEXT: movq $0, 24(%rax)
; SRC-NEXT: movq $0, 16(%rax)
; SRC-NEXT: movq $0, 8(%rax)
@@ -418,30 +418,30 @@ define i256 @test2(i256 %a) nounwind {
; LIN-NEXT: cmovneq %rsi, %rdi
; LIN-NEXT: xorq $63, %rdi
; LIN-NEXT: addq $64, %rdi
-; LIN-NEXT: xorl %r9d, %r9d
-; LIN-NEXT: movl $0, %esi
-; LIN-NEXT: sbbq %rdx, %rsi
-; LIN-NEXT: andq %rdx, %rsi
-; LIN-NEXT: bsrq %rsi, %rdx
+; LIN-NEXT: xorl %esi, %esi
+; LIN-NEXT: movl $0, %r9d
+; LIN-NEXT: sbbq %rdx, %r9
+; LIN-NEXT: andq %rdx, %r9
+; LIN-NEXT: bsrq %r9, %rdx
; LIN-NEXT: xorq $63, %rdx
-; LIN-NEXT: testq %rsi, %rsi
+; LIN-NEXT: testq %r9, %r9
; LIN-NEXT: cmoveq %rdi, %rdx
; LIN-NEXT: subq $-128, %rdx
-; LIN-NEXT: movl $0, %esi
-; LIN-NEXT: sbbq %rcx, %rsi
-; LIN-NEXT: andq %rcx, %rsi
-; LIN-NEXT: bsrq %rsi, %rcx
+; LIN-NEXT: movl $0, %edi
+; LIN-NEXT: sbbq %rcx, %rdi
+; LIN-NEXT: andq %rcx, %rdi
+; LIN-NEXT: bsrq %rdi, %rcx
; LIN-NEXT: xorq $63, %rcx
; LIN-NEXT: addq $64, %rcx
-; LIN-NEXT: sbbq %r8, %r9
-; LIN-NEXT: andq %r8, %r9
-; LIN-NEXT: bsrq %r9, %rdi
-; LIN-NEXT: xorq $63, %rdi
-; LIN-NEXT: testq %r9, %r9
-; LIN-NEXT: cmoveq %rcx, %rdi
-; LIN-NEXT: orq %rsi, %r9
-; LIN-NEXT: cmoveq %rdx, %rdi
-; LIN-NEXT: movq %rdi, (%rax)
+; LIN-NEXT: sbbq %r8, %rsi
+; LIN-NEXT: andq %r8, %rsi
+; LIN-NEXT: bsrq %rsi, %r8
+; LIN-NEXT: xorq $63, %r8
+; LIN-NEXT: testq %rsi, %rsi
+; LIN-NEXT: cmoveq %rcx, %r8
+; LIN-NEXT: orq %rdi, %rsi
+; LIN-NEXT: cmoveq %rdx, %r8
+; LIN-NEXT: movq %r8, (%rax)
; LIN-NEXT: movq $0, 8(%rax)
; LIN-NEXT: movq $0, 16(%rax)
; LIN-NEXT: movq $0, 24(%rax)
@@ -457,41 +457,41 @@ define i256 @test3(i256 %n) nounwind {
; ILP: # %bb.0:
; ILP-NEXT: pushq %rbx
; ILP-NEXT: movq %rdi, %rax
-; ILP-NEXT: xorl %edi, %edi
-; ILP-NEXT: movq %rsi, %r9
-; ILP-NEXT: negq %r9
+; ILP-NEXT: xorl %r9d, %r9d
+; ILP-NEXT: movq %rsi, %rdi
+; ILP-NEXT: negq %rdi
; ILP-NEXT: movl $0, %r10d
; ILP-NEXT: sbbq %rdx, %r10
; ILP-NEXT: movl $0, %r11d
; ILP-NEXT: sbbq %rcx, %r11
-; ILP-NEXT: sbbq %r8, %rdi
+; ILP-NEXT: sbbq %r8, %r9
; ILP-NEXT: notq %r8
-; ILP-NEXT: andq %rdi, %r8
+; ILP-NEXT: andq %r9, %r8
; ILP-NEXT: bsrq %r8, %rbx
; ILP-NEXT: notq %rdx
; ILP-NEXT: andq %r10, %rdx
-; ILP-NEXT: bsrq %rdx, %r10
+; ILP-NEXT: bsrq %rdx, %r9
; ILP-NEXT: notq %rsi
; ILP-NEXT: xorq $63, %rbx
; ILP-NEXT: notq %rcx
; ILP-NEXT: andq %r11, %rcx
-; ILP-NEXT: bsrq %rcx, %rdi
-; ILP-NEXT: xorq $63, %rdi
-; ILP-NEXT: addq $64, %rdi
-; ILP-NEXT: testq %r8, %r8
-; ILP-NEXT: cmovneq %rbx, %rdi
+; ILP-NEXT: bsrq %rcx, %r10
; ILP-NEXT: xorq $63, %r10
-; ILP-NEXT: andq %r9, %rsi
-; ILP-NEXT: movl $127, %ebx
+; ILP-NEXT: addq $64, %r10
+; ILP-NEXT: testq %r8, %r8
+; ILP-NEXT: cmovneq %rbx, %r10
+; ILP-NEXT: xorq $63, %r9
+; ILP-NEXT: andq %rdi, %rsi
+; ILP-NEXT: movl $127, %edi
; ILP-NEXT: bsrq %rsi, %rsi
-; ILP-NEXT: cmoveq %rbx, %rsi
+; ILP-NEXT: cmoveq %rdi, %rsi
; ILP-NEXT: xorq $63, %rsi
; ILP-NEXT: addq $64, %rsi
; ILP-NEXT: testq %rdx, %rdx
-; ILP-NEXT: cmovneq %r10, %rsi
+; ILP-NEXT: cmovneq %r9, %rsi
; ILP-NEXT: subq $-128, %rsi
; ILP-NEXT: orq %r8, %rcx
-; ILP-NEXT: cmovneq %rdi, %rsi
+; ILP-NEXT: cmovneq %r10, %rsi
; ILP-NEXT: movq %rsi, (%rax)
; ILP-NEXT: movq $0, 24(%rax)
; ILP-NEXT: movq $0, 16(%rax)
@@ -503,41 +503,41 @@ define i256 @test3(i256 %n) nounwind {
; HYBRID: # %bb.0:
; HYBRID-NEXT: pushq %rbx
; HYBRID-NEXT: movq %rdi, %rax
-; HYBRID-NEXT: xorl %edi, %edi
-; HYBRID-NEXT: movq %rsi, %r9
-; HYBRID-NEXT: negq %r9
+; HYBRID-NEXT: xorl %r9d, %r9d
+; HYBRID-NEXT: movq %rsi, %rdi
+; HYBRID-NEXT: negq %rdi
; HYBRID-NEXT: movl $0, %r10d
; HYBRID-NEXT: sbbq %rdx, %r10
; HYBRID-NEXT: movl $0, %r11d
; HYBRID-NEXT: sbbq %rcx, %r11
-; HYBRID-NEXT: sbbq %r8, %rdi
+; HYBRID-NEXT: sbbq %r8, %r9
; HYBRID-NEXT: notq %r8
-; HYBRID-NEXT: andq %rdi, %r8
+; HYBRID-NEXT: andq %r9, %r8
; HYBRID-NEXT: bsrq %r8, %rbx
; HYBRID-NEXT: xorq $63, %rbx
; HYBRID-NEXT: notq %rcx
; HYBRID-NEXT: andq %r11, %rcx
-; HYBRID-NEXT: bsrq %rcx, %rdi
-; HYBRID-NEXT: xorq $63, %rdi
-; HYBRID-NEXT: addq $64, %rdi
+; HYBRID-NEXT: bsrq %rcx, %r9
+; HYBRID-NEXT: xorq $63, %r9
+; HYBRID-NEXT: addq $64, %r9
; HYBRID-NEXT: testq %r8, %r8
-; HYBRID-NEXT: cmovneq %rbx, %rdi
+; HYBRID-NEXT: cmovneq %rbx, %r9
; HYBRID-NEXT: notq %rdx
; HYBRID-NEXT: andq %r10, %rdx
-; HYBRID-NEXT: bsrq %rdx, %rbx
-; HYBRID-NEXT: xorq $63, %rbx
+; HYBRID-NEXT: bsrq %rdx, %r10
+; HYBRID-NEXT: xorq $63, %r10
; HYBRID-NEXT: notq %rsi
-; HYBRID-NEXT: andq %r9, %rsi
-; HYBRID-NEXT: movl $127, %r9d
+; HYBRID-NEXT: andq %rdi, %rsi
+; HYBRID-NEXT: movl $127, %edi
; HYBRID-NEXT: bsrq %rsi, %rsi
-; HYBRID-NEXT: cmoveq %r9, %rsi
+; HYBRID-NEXT: cmoveq %rdi, %rsi
; HYBRID-NEXT: xorq $63, %rsi
; HYBRID-NEXT: addq $64, %rsi
; HYBRID-NEXT: testq %rdx, %rdx
-; HYBRID-NEXT: cmovneq %rbx, %rsi
+; HYBRID-NEXT: cmovneq %r10, %rsi
; HYBRID-NEXT: subq $-128, %rsi
; HYBRID-NEXT: orq %r8, %rcx
-; HYBRID-NEXT: cmovneq %rdi, %rsi
+; HYBRID-NEXT: cmovneq %r9, %rsi
; HYBRID-NEXT: movq %rsi, (%rax)
; HYBRID-NEXT: movq $0, 24(%rax)
; HYBRID-NEXT: movq $0, 16(%rax)
@@ -549,41 +549,41 @@ define i256 @test3(i256 %n) nounwind {
; BURR: # %bb.0:
; BURR-NEXT: pushq %rbx
; BURR-NEXT: movq %rdi, %rax
-; BURR-NEXT: xorl %edi, %edi
-; BURR-NEXT: movq %rsi, %r9
-; BURR-NEXT: negq %r9
+; BURR-NEXT: xorl %r9d, %r9d
+; BURR-NEXT: movq %rsi, %rdi
+; BURR-NEXT: negq %rdi
; BURR-NEXT: movl $0, %r10d
; BURR-NEXT: sbbq %rdx, %r10
; BURR-NEXT: movl $0, %r11d
; BURR-NEXT: sbbq %rcx, %r11
-; BURR-NEXT: sbbq %r8, %rdi
+; BURR-NEXT: sbbq %r8, %r9
; BURR-NEXT: notq %r8
-; BURR-NEXT: andq %rdi, %r8
+; BURR-NEXT: andq %r9, %r8
; BURR-NEXT: bsrq %r8, %rbx
; BURR-NEXT: xorq $63, %rbx
; BURR-NEXT: notq %rcx
; BURR-NEXT: andq %r11, %rcx
-; BURR-NEXT: bsrq %rcx, %rdi
-; BURR-NEXT: xorq $63, %rdi
-; BURR-NEXT: addq $64, %rdi
+; BURR-NEXT: bsrq %rcx, %r9
+; BURR-NEXT: xorq $63, %r9
+; BURR-NEXT: addq $64, %r9
; BURR-NEXT: testq %r8, %r8
-; BURR-NEXT: cmovneq %rbx, %rdi
+; BURR-NEXT: cmovneq %rbx, %r9
; BURR-NEXT: notq %rdx
; BURR-NEXT: andq %r10, %rdx
-; BURR-NEXT: bsrq %rdx, %rbx
-; BURR-NEXT: xorq $63, %rbx
+; BURR-NEXT: bsrq %rdx, %r10
+; BURR-NEXT: xorq $63, %r10
; BURR-NEXT: notq %rsi
-; BURR-NEXT: andq %r9, %rsi
-; BURR-NEXT: movl $127, %r9d
+; BURR-NEXT: andq %rdi, %rsi
+; BURR-NEXT: movl $127, %edi
; BURR-NEXT: bsrq %rsi, %rsi
-; BURR-NEXT: cmoveq %r9, %rsi
+; BURR-NEXT: cmoveq %rdi, %rsi
; BURR-NEXT: xorq $63, %rsi
; BURR-NEXT: addq $64, %rsi
; BURR-NEXT: testq %rdx, %rdx
-; BURR-NEXT: cmovneq %rbx, %rsi
+; BURR-NEXT: cmovneq %r10, %rsi
; BURR-NEXT: subq $-128, %rsi
; BURR-NEXT: orq %r8, %rcx
-; BURR-NEXT: cmovneq %rdi, %rsi
+; BURR-NEXT: cmovneq %r9, %rsi
; BURR-NEXT: movq %rsi, (%rax)
; BURR-NEXT: movq $0, 24(%rax)
; BURR-NEXT: movq $0, 16(%rax)
@@ -594,42 +594,42 @@ define i256 @test3(i256 %n) nounwind {
; SRC-LABEL: test3:
; SRC: # %bb.0:
; SRC-NEXT: movq %rdi, %rax
-; SRC-NEXT: movq %rsi, %r9
-; SRC-NEXT: notq %r9
-; SRC-NEXT: xorl %r10d, %r10d
+; SRC-NEXT: movq %rsi, %rdi
+; SRC-NEXT: notq %rdi
+; SRC-NEXT: xorl %r9d, %r9d
; SRC-NEXT: negq %rsi
-; SRC-NEXT: movl $0, %r11d
-; SRC-NEXT: sbbq %rdx, %r11
+; SRC-NEXT: movl $0, %r10d
+; SRC-NEXT: sbbq %rdx, %r10
; SRC-NEXT: notq %rdx
-; SRC-NEXT: movl $0, %edi
-; SRC-NEXT: sbbq %rcx, %rdi
+; SRC-NEXT: movl $0, %r11d
+; SRC-NEXT: sbbq %rcx, %r11
; SRC-NEXT: notq %rcx
-; SRC-NEXT: sbbq %r8, %r10
+; SRC-NEXT: sbbq %r8, %r9
; SRC-NEXT: notq %r8
-; SRC-NEXT: andq %r11, %rdx
-; SRC-NEXT: andq %rdi, %rcx
-; SRC-NEXT: andq %r10, %r8
-; SRC-NEXT: andq %r9, %rsi
-; SRC-NEXT: bsrq %r8, %r9
-; SRC-NEXT: xorq $63, %r9
-; SRC-NEXT: bsrq %rcx, %rdi
+; SRC-NEXT: andq %r10, %rdx
+; SRC-NEXT: andq %r11, %rcx
+; SRC-NEXT: andq %r9, %r8
+; SRC-NEXT: andq %rdi, %rsi
+; SRC-NEXT: bsrq %r8, %rdi
; SRC-NEXT: xorq $63, %rdi
-; SRC-NEXT: addq $64, %rdi
-; SRC-NEXT: testq %r8, %r8
-; SRC-NEXT: cmovneq %r9, %rdi
-; SRC-NEXT: bsrq %rdx, %r9
+; SRC-NEXT: bsrq %rcx, %r9
; SRC-NEXT: xorq $63, %r9
-; SRC-NEXT: bsrq %rsi, %r10
-; SRC-NEXT: movl $127, %esi
-; SRC-NEXT: cmovneq %r10, %rsi
-; SRC-NEXT: xorq $63, %rsi
-; SRC-NEXT: addq $64, %rsi
+; SRC-NEXT: addq $64, %r9
+; SRC-NEXT: testq %r8, %r8
+; SRC-NEXT: cmovneq %rdi, %r9
+; SRC-NEXT: bsrq %rdx, %rdi
+; SRC-NEXT: xorq $63, %rdi
+; SRC-NEXT: bsrq %rsi, %rsi
+; SRC-NEXT: movl $127, %r10d
+; SRC-NEXT: cmovneq %rsi, %r10
+; SRC-NEXT: xorq $63, %r10
+; SRC-NEXT: addq $64, %r10
; SRC-NEXT: testq %rdx, %rdx
-; SRC-NEXT: cmovneq %r9, %rsi
-; SRC-NEXT: subq $-128, %rsi
+; SRC-NEXT: cmovneq %rdi, %r10
+; SRC-NEXT: subq $-128, %r10
; SRC-NEXT: orq %rcx, %r8
-; SRC-NEXT: cmovneq %rdi, %rsi
-; SRC-NEXT: movq %rsi, (%rax)
+; SRC-NEXT: cmovneq %r9, %r10
+; SRC-NEXT: movq %r10, (%rax)
; SRC-NEXT: movq $0, 24(%rax)
; SRC-NEXT: movq $0, 16(%rax)
; SRC-NEXT: movq $0, 8(%rax)
@@ -643,11 +643,11 @@ define i256 @test3(i256 %n) nounwind {
; LIN-NEXT: notq %rsi
; LIN-NEXT: andq %rdi, %rsi
; LIN-NEXT: bsrq %rsi, %rsi
-; LIN-NEXT: movl $127, %edi
-; LIN-NEXT: cmovneq %rsi, %rdi
-; LIN-NEXT: xorq $63, %rdi
-; LIN-NEXT: addq $64, %rdi
-; LIN-NEXT: xorl %r9d, %r9d
+; LIN-NEXT: movl $127, %r9d
+; LIN-NEXT: cmovneq %rsi, %r9
+; LIN-NEXT: xorq $63, %r9
+; LIN-NEXT: addq $64, %r9
+; LIN-NEXT: xorl %edi, %edi
; LIN-NEXT: movl $0, %esi
; LIN-NEXT: sbbq %rdx, %rsi
; LIN-NEXT: notq %rdx
@@ -655,7 +655,7 @@ define i256 @test3(i256 %n) nounwind {
; LIN-NEXT: bsrq %rdx, %rsi
; LIN-NEXT: xorq $63, %rsi
; LIN-NEXT: testq %rdx, %rdx
-; LIN-NEXT: cmoveq %rdi, %rsi
+; LIN-NEXT: cmoveq %r9, %rsi
; LIN-NEXT: subq $-128, %rsi
; LIN-NEXT: movl $0, %edx
; LIN-NEXT: sbbq %rcx, %rdx
@@ -664,9 +664,9 @@ define i256 @test3(i256 %n) nounwind {
; LIN-NEXT: bsrq %rcx, %rdx
; LIN-NEXT: xorq $63, %rdx
; LIN-NEXT: addq $64, %rdx
-; LIN-NEXT: sbbq %r8, %r9
+; LIN-NEXT: sbbq %r8, %rdi
; LIN-NEXT: notq %r8
-; LIN-NEXT: andq %r9, %r8
+; LIN-NEXT: andq %rdi, %r8
; LIN-NEXT: bsrq %r8, %rdi
; LIN-NEXT: xorq $63, %rdi
; LIN-NEXT: testq %r8, %r8
@@ -777,27 +777,27 @@ define i256 @PR25498(i256 %a) nounwind {
; ILP: # %bb.0:
; ILP-NEXT: pushq %rbx
; ILP-NEXT: movq %rdi, %rax
-; ILP-NEXT: xorl %r9d, %r9d
+; ILP-NEXT: xorl %edi, %edi
; ILP-NEXT: movq %rsi, %rbx
; ILP-NEXT: negq %rbx
; ILP-NEXT: movl $0, %r11d
; ILP-NEXT: sbbq %rdx, %r11
+; ILP-NEXT: movl $0, %r9d
+; ILP-NEXT: sbbq %rcx, %r9
; ILP-NEXT: movl $0, %r10d
-; ILP-NEXT: sbbq %rcx, %r10
-; ILP-NEXT: movl $0, %edi
-; ILP-NEXT: sbbq %r8, %rdi
+; ILP-NEXT: sbbq %r8, %r10
; ILP-NEXT: orq %r8, %rdx
; ILP-NEXT: orq %rcx, %rsi
; ILP-NEXT: orq %rdx, %rsi
; ILP-NEXT: je .LBB4_1
; ILP-NEXT: # %bb.2: # %cond.false
; ILP-NEXT: bsrq %r11, %rdx
-; ILP-NEXT: bsrq %rdi, %rcx
+; ILP-NEXT: bsrq %r10, %rcx
; ILP-NEXT: xorq $63, %rcx
-; ILP-NEXT: bsrq %r10, %rsi
+; ILP-NEXT: bsrq %r9, %rsi
; ILP-NEXT: xorq $63, %rsi
; ILP-NEXT: addq $64, %rsi
-; ILP-NEXT: testq %rdi, %rdi
+; ILP-NEXT: testq %r10, %r10
; ILP-NEXT: cmovneq %rcx, %rsi
; ILP-NEXT: xorq $63, %rdx
; ILP-NEXT: bsrq %rbx, %rcx
@@ -806,17 +806,17 @@ define i256 @PR25498(i256 %a) nounwind {
; ILP-NEXT: testq %r11, %r11
; ILP-NEXT: cmovneq %rdx, %rcx
; ILP-NEXT: subq $-128, %rcx
-; ILP-NEXT: xorl %r9d, %r9d
-; ILP-NEXT: orq %rdi, %r10
+; ILP-NEXT: xorl %edi, %edi
+; ILP-NEXT: orq %r10, %r9
; ILP-NEXT: cmovneq %rsi, %rcx
; ILP-NEXT: jmp .LBB4_3
; ILP-NEXT: .LBB4_1:
; ILP-NEXT: movl $256, %ecx # imm = 0x100
; ILP-NEXT: .LBB4_3: # %cond.end
; ILP-NEXT: movq %rcx, (%rax)
-; ILP-NEXT: movq %r9, 8(%rax)
-; ILP-NEXT: movq %r9, 16(%rax)
-; ILP-NEXT: movq %r9, 24(%rax)
+; ILP-NEXT: movq %rdi, 8(%rax)
+; ILP-NEXT: movq %rdi, 16(%rax)
+; ILP-NEXT: movq %rdi, 24(%rax)
; ILP-NEXT: popq %rbx
; ILP-NEXT: retq
;
@@ -824,26 +824,26 @@ define i256 @PR25498(i256 %a) nounwind {
; HYBRID: # %bb.0:
; HYBRID-NEXT: pushq %rbx
; HYBRID-NEXT: movq %rdi, %rax
-; HYBRID-NEXT: xorl %r9d, %r9d
+; HYBRID-NEXT: xorl %edi, %edi
; HYBRID-NEXT: movq %rsi, %rbx
; HYBRID-NEXT: negq %rbx
; HYBRID-NEXT: movl $0, %r11d
; HYBRID-NEXT: sbbq %rdx, %r11
+; HYBRID-NEXT: movl $0, %r9d
+; HYBRID-NEXT: sbbq %rcx, %r9
; HYBRID-NEXT: movl $0, %r10d
-; HYBRID-NEXT: sbbq %rcx, %r10
-; HYBRID-NEXT: movl $0, %edi
-; HYBRID-NEXT: sbbq %r8, %rdi
+; HYBRID-NEXT: sbbq %r8, %r10
; HYBRID-NEXT: orq %r8, %rdx
; HYBRID-NEXT: orq %rcx, %rsi
; HYBRID-NEXT: orq %rdx, %rsi
; HYBRID-NEXT: je .LBB4_1
; HYBRID-NEXT: # %bb.2: # %cond.false
-; HYBRID-NEXT: bsrq %rdi, %rcx
+; HYBRID-NEXT: bsrq %r10, %rcx
; HYBRID-NEXT: xorq $63, %rcx
-; HYBRID-NEXT: bsrq %r10, %rdx
+; HYBRID-NEXT: bsrq %r9, %rdx
; HYBRID-NEXT: xorq $63, %rdx
; HYBRID-NEXT: addq $64, %rdx
-; HYBRID-NEXT: testq %rdi, %rdi
+; HYBRID-NEXT: testq %r10, %r10
; HYBRID-NEXT: cmovneq %rcx, %rdx
; HYBRID-NEXT: bsrq %r11, %rsi
; HYBRID-NEXT: xorq $63, %rsi
@@ -853,17 +853,17 @@ define i256 @PR25498(i256 %a) nounwind {
; HYBRID-NEXT: testq %r11, %r11
; HYBRID-NEXT: cmovneq %rsi, %rcx
; HYBRID-NEXT: subq $-128, %rcx
-; HYBRID-NEXT: orq %rdi, %r10
+; HYBRID-NEXT: orq %r10, %r9
; HYBRID-NEXT: cmovneq %rdx, %rcx
-; HYBRID-NEXT: xorl %r9d, %r9d
+; HYBRID-NEXT: xorl %edi, %edi
; HYBRID-NEXT: jmp .LBB4_3
; HYBRID-NEXT: .LBB4_1:
; HYBRID-NEXT: movl $256, %ecx # imm = 0x100
; HYBRID-NEXT: .LBB4_3: # %cond.end
; HYBRID-NEXT: movq %rcx, (%rax)
-; HYBRID-NEXT: movq %r9, 8(%rax)
-; HYBRID-NEXT: movq %r9, 16(%rax)
-; HYBRID-NEXT: movq %r9, 24(%rax)
+; HYBRID-NEXT: movq %rdi, 8(%rax)
+; HYBRID-NEXT: movq %rdi, 16(%rax)
+; HYBRID-NEXT: movq %rdi, 24(%rax)
; HYBRID-NEXT: popq %rbx
; HYBRID-NEXT: retq
;
@@ -871,26 +871,26 @@ define i256 @PR25498(i256 %a) nounwind {
; BURR: # %bb.0:
; BURR-NEXT: pushq %rbx
; BURR-NEXT: movq %rdi, %rax
-; BURR-NEXT: xorl %r9d, %r9d
+; BURR-NEXT: xorl %edi, %edi
; BURR-NEXT: movq %rsi, %rbx
; BURR-NEXT: negq %rbx
; BURR-NEXT: movl $0, %r11d
; BURR-NEXT: sbbq %rdx, %r11
+; BURR-NEXT: movl $0, %r9d
+; BURR-NEXT: sbbq %rcx, %r9
; BURR-NEXT: movl $0, %r10d
-; BURR-NEXT: sbbq %rcx, %r10
-; BURR-NEXT: movl $0, %edi
-; BURR-NEXT: sbbq %r8, %rdi
+; BURR-NEXT: sbbq %r8, %r10
; BURR-NEXT: orq %r8, %rdx
; BURR-NEXT: orq %rcx, %rsi
; BURR-NEXT: orq %rdx, %rsi
; BURR-NEXT: je .LBB4_1
; BURR-NEXT: # %bb.2: # %cond.false
-; BURR-NEXT: bsrq %rdi, %rcx
+; BURR-NEXT: bsrq %r10, %rcx
; BURR-NEXT: xorq $63, %rcx
-; BURR-NEXT: bsrq %r10, %rdx
+; BURR-NEXT: bsrq %r9, %rdx
; BURR-NEXT: xorq $63, %rdx
; BURR-NEXT: addq $64, %rdx
-; BURR-NEXT: testq %rdi, %rdi
+; BURR-NEXT: testq %r10, %r10
; BURR-NEXT: cmovneq %rcx, %rdx
; BURR-NEXT: bsrq %r11, %rsi
; BURR-NEXT: xorq $63, %rsi
@@ -900,17 +900,17 @@ define i256 @PR25498(i256 %a) nounwind {
; BURR-NEXT: testq %r11, %r11
; BURR-NEXT: cmovneq %rsi, %rcx
; BURR-NEXT: subq $-128, %rcx
-; BURR-NEXT: orq %rdi, %r10
+; BURR-NEXT: orq %r10, %r9
; BURR-NEXT: cmovneq %rdx, %rcx
-; BURR-NEXT: xorl %r9d, %r9d
+; BURR-NEXT: xorl %edi, %edi
; BURR-NEXT: jmp .LBB4_3
; BURR-NEXT: .LBB4_1:
; BURR-NEXT: movl $256, %ecx # imm = 0x100
; BURR-NEXT: .LBB4_3: # %cond.end
; BURR-NEXT: movq %rcx, (%rax)
-; BURR-NEXT: movq %r9, 8(%rax)
-; BURR-NEXT: movq %r9, 16(%rax)
-; BURR-NEXT: movq %r9, 24(%rax)
+; BURR-NEXT: movq %rdi, 8(%rax)
+; BURR-NEXT: movq %rdi, 16(%rax)
+; BURR-NEXT: movq %rdi, 24(%rax)
; BURR-NEXT: popq %rbx
; BURR-NEXT: retq
;
@@ -918,26 +918,26 @@ define i256 @PR25498(i256 %a) nounwind {
; SRC: # %bb.0:
; SRC-NEXT: pushq %rbx
; SRC-NEXT: movq %rdi, %rax
-; SRC-NEXT: xorl %r9d, %r9d
+; SRC-NEXT: xorl %edi, %edi
; SRC-NEXT: movq %rsi, %rbx
; SRC-NEXT: negq %rbx
; SRC-NEXT: movl $0, %r11d
; SRC-NEXT: sbbq %rdx, %r11
+; SRC-NEXT: movl $0, %r9d
+; SRC-NEXT: sbbq %rcx, %r9
; SRC-NEXT: movl $0, %r10d
-; SRC-NEXT: sbbq %rcx, %r10
-; SRC-NEXT: movl $0, %edi
-; SRC-NEXT: sbbq %r8, %rdi
+; SRC-NEXT: sbbq %r8, %r10
; SRC-NEXT: orq %r8, %rdx
; SRC-NEXT: orq %rcx, %rsi
; SRC-NEXT: orq %rdx, %rsi
; SRC-NEXT: je .LBB4_1
; SRC-NEXT: # %bb.2: # %cond.false
-; SRC-NEXT: bsrq %rdi, %rcx
+; SRC-NEXT: bsrq %r10, %rcx
; SRC-NEXT: xorq $63, %rcx
-; SRC-NEXT: bsrq %r10, %rdx
+; SRC-NEXT: bsrq %r9, %rdx
; SRC-NEXT: xorq $63, %rdx
; SRC-NEXT: addq $64, %rdx
-; SRC-NEXT: testq %rdi, %rdi
+; SRC-NEXT: testq %r10, %r10
; SRC-NEXT: cmovneq %rcx, %rdx
; SRC-NEXT: bsrq %r11, %rsi
; SRC-NEXT: xorq $63, %rsi
@@ -947,17 +947,17 @@ define i256 @PR25498(i256 %a) nounwind {
; SRC-NEXT: testq %r11, %r11
; SRC-NEXT: cmovneq %rsi, %rcx
; SRC-NEXT: subq $-128, %rcx
-; SRC-NEXT: orq %rdi, %r10
+; SRC-NEXT: orq %r10, %r9
; SRC-NEXT: cmovneq %rdx, %rcx
-; SRC-NEXT: xorl %r9d, %r9d
+; SRC-NEXT: xorl %edi, %edi
; SRC-NEXT: jmp .LBB4_3
; SRC-NEXT: .LBB4_1:
; SRC-NEXT: movl $256, %ecx # imm = 0x100
; SRC-NEXT: .LBB4_3: # %cond.end
; SRC-NEXT: movq %rcx, (%rax)
-; SRC-NEXT: movq %r9, 8(%rax)
-; SRC-NEXT: movq %r9, 16(%rax)
-; SRC-NEXT: movq %r9, 24(%rax)
+; SRC-NEXT: movq %rdi, 8(%rax)
+; SRC-NEXT: movq %rdi, 16(%rax)
+; SRC-NEXT: movq %rdi, 24(%rax)
; SRC-NEXT: popq %rbx
; SRC-NEXT: retq
;
@@ -967,13 +967,13 @@ define i256 @PR25498(i256 %a) nounwind {
; LIN-NEXT: movq %rdi, %rax
; LIN-NEXT: movq %rsi, %rbx
; LIN-NEXT: negq %rbx
-; LIN-NEXT: xorl %r9d, %r9d
-; LIN-NEXT: movl $0, %edi
-; LIN-NEXT: sbbq %rdx, %rdi
-; LIN-NEXT: movl $0, %r10d
-; LIN-NEXT: sbbq %rcx, %r10
+; LIN-NEXT: xorl %edi, %edi
; LIN-NEXT: movl $0, %r11d
-; LIN-NEXT: sbbq %r8, %r11
+; LIN-NEXT: sbbq %rdx, %r11
+; LIN-NEXT: movl $0, %r9d
+; LIN-NEXT: sbbq %rcx, %r9
+; LIN-NEXT: movl $0, %r10d
+; LIN-NEXT: sbbq %r8, %r10
; LIN-NEXT: orq %rcx, %rsi
; LIN-NEXT: orq %r8, %rdx
; LIN-NEXT: orq %rsi, %rdx
@@ -982,29 +982,29 @@ define i256 @PR25498(i256 %a) nounwind {
; LIN-NEXT: bsrq %rbx, %rcx
; LIN-NEXT: xorq $63, %rcx
; LIN-NEXT: addq $64, %rcx
-; LIN-NEXT: bsrq %rdi, %rdx
+; LIN-NEXT: bsrq %r11, %rdx
; LIN-NEXT: xorq $63, %rdx
-; LIN-NEXT: testq %rdi, %rdi
+; LIN-NEXT: testq %r11, %r11
; LIN-NEXT: cmoveq %rcx, %rdx
; LIN-NEXT: subq $-128, %rdx
-; LIN-NEXT: bsrq %r10, %rsi
+; LIN-NEXT: bsrq %r9, %rsi
; LIN-NEXT: xorq $63, %rsi
; LIN-NEXT: addq $64, %rsi
-; LIN-NEXT: bsrq %r11, %rcx
+; LIN-NEXT: bsrq %r10, %rcx
; LIN-NEXT: xorq $63, %rcx
-; LIN-NEXT: testq %r11, %r11
+; LIN-NEXT: testq %r10, %r10
; LIN-NEXT: cmoveq %rsi, %rcx
-; LIN-NEXT: orq %r11, %r10
+; LIN-NEXT: orq %r10, %r9
; LIN-NEXT: cmoveq %rdx, %rcx
-; LIN-NEXT: xorl %r9d, %r9d
+; LIN-NEXT: xorl %edi, %edi
; LIN-NEXT: jmp .LBB4_3
; LIN-NEXT: .LBB4_1:
; LIN-NEXT: movl $256, %ecx # imm = 0x100
; LIN-NEXT: .LBB4_3: # %cond.end
; LIN-NEXT: movq %rcx, (%rax)
-; LIN-NEXT: movq %r9, 8(%rax)
-; LIN-NEXT: movq %r9, 16(%rax)
-; LIN-NEXT: movq %r9, 24(%rax)
+; LIN-NEXT: movq %rdi, 8(%rax)
+; LIN-NEXT: movq %rdi, 16(%rax)
+; LIN-NEXT: movq %rdi, 24(%rax)
; LIN-NEXT: popq %rbx
; LIN-NEXT: retq
%b = sub i256 0, %a
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index 5b4d180140957..d0a1ed34359d7 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -194,7 +194,6 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
define i4 @func4(i4 %x, i4 %y) nounwind {
; X64-LABEL: func4:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
; X64-NEXT: shlb $4, %sil
; X64-NEXT: sarb $4, %sil
; X64-NEXT: shlb $4, %dil
@@ -203,21 +202,20 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
; X64-NEXT: movsbl %dil, %ecx
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: idivb %sil
-; X64-NEXT: movsbl %ah, %ebx
+; X64-NEXT: movsbl %ah, %edx
; X64-NEXT: movzbl %al, %edi
; X64-NEXT: leal -1(%rdi), %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: testb %sil, %sil
-; X64-NEXT: sets %dl
+; X64-NEXT: sets %sil
; X64-NEXT: testb %cl, %cl
; X64-NEXT: sets %cl
-; X64-NEXT: xorb %dl, %cl
-; X64-NEXT: testb %bl, %bl
+; X64-NEXT: xorb %sil, %cl
+; X64-NEXT: testb %dl, %dl
; X64-NEXT: setne %dl
; X64-NEXT: testb %cl, %dl
; X64-NEXT: cmovel %edi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: popq %rbx
; X64-NEXT: retq
;
; X86-LABEL: func4:
@@ -264,29 +262,29 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
; X64-NEXT: pushq %rax
-; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %rdi, %r14
; X64-NEXT: movq %rdi, %r15
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: shldq $31, %rdi, %rbx
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: shldq $31, %rdi, %r15
+; X64-NEXT: shlq $31, %r14
; X64-NEXT: movq %rsi, %r12
; X64-NEXT: sarq $63, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %rbx, %rsi
-; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: movq %r14, %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %rbx, %rdx
; X64-NEXT: movq %r12, %rcx
; X64-NEXT: callq __divti3@PLT
; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
; X64-NEXT: leaq -1(%rax), %rbp
-; X64-NEXT: testq %rbx, %rbx
+; X64-NEXT: testq %r15, %r15
; X64-NEXT: sets %al
; X64-NEXT: testq %r12, %r12
; X64-NEXT: sets %r13b
; X64-NEXT: xorb %al, %r13b
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %rbx, %rsi
-; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: movq %r14, %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %rbx, %rdx
; X64-NEXT: movq %r12, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
@@ -456,37 +454,37 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rcx
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X64-NEXT: movq %xmm3, %rcx
+; X64-NEXT: movq %xmm3, %rdi
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm3, %rax
; X64-NEXT: cqto
-; X64-NEXT: idivq %rcx
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: idivq %rdi
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: pcmpgtd %xmm4, %xmm3
; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; X64-NEXT: movq %xmm4, %rdi
+; X64-NEXT: movq %xmm4, %r9
; X64-NEXT: pxor %xmm5, %xmm5
; X64-NEXT: pcmpgtd %xmm1, %xmm5
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; X64-NEXT: psllq $31, %xmm1
; X64-NEXT: movq %xmm1, %rax
; X64-NEXT: cqto
-; X64-NEXT: idivq %rdi
+; X64-NEXT: idivq %r9
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X64-NEXT: movq %xmm4, %rsi
+; X64-NEXT: movq %xmm4, %r11
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm4, %rax
; X64-NEXT: cqto
-; X64-NEXT: idivq %rsi
-; X64-NEXT: movq %r11, %xmm4
-; X64-NEXT: movq %rcx, %xmm5
+; X64-NEXT: idivq %r11
+; X64-NEXT: movq %rsi, %xmm4
+; X64-NEXT: movq %r8, %xmm5
; X64-NEXT: pxor %xmm6, %xmm6
; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; X64-NEXT: pcmpeqd %xmm6, %xmm4
@@ -498,9 +496,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pcmpgtd %xmm0, %xmm2
-; X64-NEXT: movq %r8, %xmm0
+; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: pxor %xmm4, %xmm2
-; X64-NEXT: movq %r10, %xmm4
+; X64-NEXT: movq %rdi, %xmm4
; X64-NEXT: pandn %xmm2, %xmm5
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X64-NEXT: movdqa %xmm5, %xmm2
@@ -509,7 +507,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: paddq %xmm4, %xmm0
; X64-NEXT: pand %xmm5, %xmm0
; X64-NEXT: por %xmm2, %xmm0
-; X64-NEXT: movq %rdi, %xmm2
+; X64-NEXT: movq %r10, %xmm2
; X64-NEXT: movq %rdx, %xmm5
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
; X64-NEXT: pcmpeqd %xmm6, %xmm2
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 20ea9c5aeab2b..78e47057546b3 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -228,7 +228,6 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
;
; X64-LABEL: func4:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
; X64-NEXT: shlb $4, %sil
; X64-NEXT: sarb $4, %sil
; X64-NEXT: shlb $4, %dil
@@ -237,16 +236,16 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
; X64-NEXT: movsbl %dil, %ecx
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: idivb %sil
-; X64-NEXT: movsbl %ah, %ebx
+; X64-NEXT: movsbl %ah, %edx
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: leal -1(%rax), %edi
; X64-NEXT: movzbl %dil, %edi
; X64-NEXT: testb %sil, %sil
-; X64-NEXT: sets %dl
+; X64-NEXT: sets %sil
; X64-NEXT: testb %cl, %cl
; X64-NEXT: sets %cl
-; X64-NEXT: xorb %dl, %cl
-; X64-NEXT: testb %bl, %bl
+; X64-NEXT: xorb %sil, %cl
+; X64-NEXT: testb %dl, %dl
; X64-NEXT: setne %dl
; X64-NEXT: testb %cl, %dl
; X64-NEXT: cmovel %eax, %edi
@@ -257,7 +256,6 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
; X64-NEXT: movl $248, %eax
; X64-NEXT: cmovgel %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: popq %rbx
; X64-NEXT: retq
;
; X86-LABEL: func4:
@@ -590,10 +588,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %r15
-; X64-NEXT: movq %r15, %rbp
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %rbp
; X64-NEXT: sarq $63, %rbp
-; X64-NEXT: shldq $31, %r15, %rbp
+; X64-NEXT: shldq $31, %rbx, %rbp
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
@@ -601,13 +599,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm1, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %r12
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
; X64-NEXT: shlq $31, %r12
; X64-NEXT: movq %r12, %rdi
; X64-NEXT: movq %rbp, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -615,16 +613,16 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: subq $1, %r13
; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %r15
-; X64-NEXT: xorl %ebx, %r15d
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
; X64-NEXT: movq %r12, %rdi
; X64-NEXT: movq %rbp, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: testb %r15b, %al
+; X64-NEXT: testb %bl, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
@@ -649,57 +647,57 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %rbx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shldq $31, %rbx, %rbp
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: sarq $63, %rbp
-; X64-NEXT: movq %rbx, %r15
-; X64-NEXT: shlq $31, %r15
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbp, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r12
+; X64-NEXT: subq $1, %r13
; X64-NEXT: sbbq $0, %r14
; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %ebp, %ebx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbp, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
; X64-NEXT: testb %bl, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: cmpq %rcx, %r13
; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
-; X64-NEXT: cmovbq %r12, %rax
+; X64-NEXT: cmovbq %r13, %rax
; X64-NEXT: testq %r14, %r14
-; X64-NEXT: cmovnsq %rcx, %r12
-; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: cmovnsq %rcx, %r13
+; X64-NEXT: cmoveq %rax, %r13
; X64-NEXT: movl $0, %eax
; X64-NEXT: cmovnsq %rax, %r14
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: cmpq %rcx, %r13
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: cmovaq %r12, %rax
+; X64-NEXT: cmovaq %r13, %rax
; X64-NEXT: testq %r14, %r14
-; X64-NEXT: cmovsq %rcx, %r12
+; X64-NEXT: cmovsq %rcx, %r13
; X64-NEXT: cmpq $-1, %r14
-; X64-NEXT: cmoveq %rax, %r12
-; X64-NEXT: movq %r12, %xmm0
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: psrlq $1, %xmm1
@@ -715,9 +713,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %rbx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shldq $31, %rbx, %rbp
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pcmpgtd %xmm0, %xmm1
@@ -725,105 +723,105 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: sarq $63, %rbp
-; X64-NEXT: movq %rbx, %r15
-; X64-NEXT: shlq $31, %r15
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbp, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r12
+; X64-NEXT: subq $1, %r13
; X64-NEXT: sbbq $0, %r14
; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %ebp, %ebx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbp, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
; X64-NEXT: testb %bl, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: cmpq %rcx, %r13
; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
-; X64-NEXT: cmovbq %r12, %rax
+; X64-NEXT: cmovbq %r13, %rax
; X64-NEXT: testq %r14, %r14
-; X64-NEXT: cmovnsq %rcx, %r12
-; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: cmovnsq %rcx, %r13
+; X64-NEXT: cmoveq %rax, %r13
; X64-NEXT: movl $0, %eax
; X64-NEXT: cmovnsq %rax, %r14
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: cmpq %rcx, %r13
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: cmovaq %r12, %rax
+; X64-NEXT: cmovaq %r13, %rax
; X64-NEXT: testq %r14, %r14
-; X64-NEXT: cmovsq %rcx, %r12
+; X64-NEXT: cmovsq %rcx, %r13
; X64-NEXT: cmpq $-1, %r14
-; X64-NEXT: cmoveq %rax, %r12
-; X64-NEXT: movq %r12, %xmm0
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %rbx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shldq $31, %rbx, %rbp
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: sarq $63, %rbp
-; X64-NEXT: movq %rbx, %r15
-; X64-NEXT: shlq $31, %r15
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbp, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r12
+; X64-NEXT: subq $1, %r13
; X64-NEXT: sbbq $0, %r14
; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %ebp, %ebx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbp, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
; X64-NEXT: testb %bl, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: cmpq %rcx, %r13
; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
-; X64-NEXT: cmovbq %r12, %rax
+; X64-NEXT: cmovbq %r13, %rax
; X64-NEXT: testq %r14, %r14
-; X64-NEXT: cmovnsq %rcx, %r12
-; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: cmovnsq %rcx, %r13
+; X64-NEXT: cmoveq %rax, %r13
; X64-NEXT: movl $0, %eax
; X64-NEXT: cmovnsq %rax, %r14
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: cmpq %rcx, %r13
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: cmovaq %r12, %rax
+; X64-NEXT: cmovaq %r13, %rax
; X64-NEXT: testq %r14, %r14
-; X64-NEXT: cmovsq %rcx, %r12
+; X64-NEXT: cmovsq %rcx, %r13
; X64-NEXT: cmpq $-1, %r14
-; X64-NEXT: cmoveq %rax, %r12
-; X64-NEXT: movq %r12, %xmm1
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movq %r13, %xmm1
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: psrlq $1, %xmm0
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index dbee6bfda0a16..f67d8e9aab27b 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -80,21 +80,21 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm4, %rcx
; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %r8
+; SSE2-NEXT: movq %xmm1, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: xorq %rax, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rsi
-; SSE2-NEXT: xorq %rcx, %rsi
-; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: movq %xmm0, %r8
+; SSE2-NEXT: xorq %rcx, %r8
+; SSE2-NEXT: orq %rdi, %r8
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorq %rdx, %rax
; SSE2-NEXT: movq %xmm3, %rcx
-; SSE2-NEXT: xorq %r8, %rcx
+; SSE2-NEXT: xorq %rsi, %rcx
; SSE2-NEXT: orq %rax, %rcx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rsi, %rcx
+; SSE2-NEXT: orq %r8, %rcx
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
@@ -103,19 +103,19 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: movq %xmm1, %rcx
; SSE41-NEXT: pextrq $1, %xmm0, %rdx
-; SSE41-NEXT: pextrq $1, %xmm1, %r8
+; SSE41-NEXT: pextrq $1, %xmm1, %rsi
; SSE41-NEXT: movq %xmm2, %rdi
; SSE41-NEXT: xorq %rax, %rdi
-; SSE41-NEXT: movq %xmm3, %rsi
-; SSE41-NEXT: xorq %rcx, %rsi
-; SSE41-NEXT: orq %rdi, %rsi
+; SSE41-NEXT: movq %xmm3, %r8
+; SSE41-NEXT: xorq %rcx, %r8
+; SSE41-NEXT: orq %rdi, %r8
; SSE41-NEXT: pextrq $1, %xmm2, %rax
; SSE41-NEXT: xorq %rdx, %rax
; SSE41-NEXT: pextrq $1, %xmm3, %rcx
-; SSE41-NEXT: xorq %r8, %rcx
+; SSE41-NEXT: xorq %rsi, %rcx
; SSE41-NEXT: orq %rax, %rcx
; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %rsi, %rcx
+; SSE41-NEXT: orq %r8, %rcx
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
@@ -160,21 +160,21 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm4, %rcx
; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %r8
+; SSE2-NEXT: movq %xmm1, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: xorq %rax, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rsi
-; SSE2-NEXT: xorq %rcx, %rsi
-; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: movq %xmm0, %r8
+; SSE2-NEXT: xorq %rcx, %r8
+; SSE2-NEXT: orq %rdi, %r8
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorq %rdx, %rax
; SSE2-NEXT: movq %xmm3, %rcx
-; SSE2-NEXT: xorq %r8, %rcx
+; SSE2-NEXT: xorq %rsi, %rcx
; SSE2-NEXT: orq %rax, %rcx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rsi, %rcx
+; SSE2-NEXT: orq %r8, %rcx
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
@@ -183,19 +183,19 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: movq %xmm1, %rcx
; SSE41-NEXT: pextrq $1, %xmm0, %rdx
-; SSE41-NEXT: pextrq $1, %xmm1, %r8
+; SSE41-NEXT: pextrq $1, %xmm1, %rsi
; SSE41-NEXT: movq %xmm2, %rdi
; SSE41-NEXT: xorq %rax, %rdi
-; SSE41-NEXT: movq %xmm3, %rsi
-; SSE41-NEXT: xorq %rcx, %rsi
-; SSE41-NEXT: orq %rdi, %rsi
+; SSE41-NEXT: movq %xmm3, %r8
+; SSE41-NEXT: xorq %rcx, %r8
+; SSE41-NEXT: orq %rdi, %r8
; SSE41-NEXT: pextrq $1, %xmm2, %rax
; SSE41-NEXT: xorq %rdx, %rax
; SSE41-NEXT: pextrq $1, %xmm3, %rcx
-; SSE41-NEXT: xorq %r8, %rcx
+; SSE41-NEXT: xorq %rsi, %rcx
; SSE41-NEXT: orq %rax, %rcx
; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %rsi, %rcx
+; SSE41-NEXT: orq %r8, %rcx
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
@@ -242,14 +242,14 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %rax
-; SSE2-NEXT: movq %xmm0, %r11
+; SSE2-NEXT: movq %xmm8, %r8
+; SSE2-NEXT: movq %xmm0, %r9
; SSE2-NEXT: movq %xmm2, %r10
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %xmm3, %r8
+; SSE2-NEXT: movq %xmm1, %rcx
+; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: xorq %rdx, %rcx
+; SSE2-NEXT: movq %xmm0, %r11
+; SSE2-NEXT: xorq %rdx, %r11
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: xorq %rsi, %rdx
@@ -258,23 +258,23 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE2-NEXT: xorq %rdi, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: xorq %rax, %rdi
+; SSE2-NEXT: xorq %r8, %rdi
; SSE2-NEXT: orq %rsi, %rdi
; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: orq %rcx, %rdi
-; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: xorq %r11, %rax
-; SSE2-NEXT: movq %xmm6, %rcx
-; SSE2-NEXT: xorq %r10, %rcx
-; SSE2-NEXT: movq %xmm5, %rdx
+; SSE2-NEXT: orq %r11, %rdi
+; SSE2-NEXT: movq %xmm4, %rdx
; SSE2-NEXT: xorq %r9, %rdx
-; SSE2-NEXT: movq %xmm7, %rsi
-; SSE2-NEXT: xorq %r8, %rsi
-; SSE2-NEXT: orq %rdx, %rsi
-; SSE2-NEXT: orq %rcx, %rsi
-; SSE2-NEXT: orq %rax, %rsi
+; SSE2-NEXT: movq %xmm6, %rsi
+; SSE2-NEXT: xorq %r10, %rsi
+; SSE2-NEXT: movq %xmm5, %r8
+; SSE2-NEXT: xorq %rcx, %r8
+; SSE2-NEXT: movq %xmm7, %rcx
+; SSE2-NEXT: xorq %rax, %rcx
+; SSE2-NEXT: orq %r8, %rcx
+; SSE2-NEXT: orq %rsi, %rcx
+; SSE2-NEXT: orq %rdx, %rcx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: orq %rdi, %rcx
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
@@ -284,12 +284,12 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE41-NEXT: movq %xmm2, %rcx
; SSE41-NEXT: movq %xmm1, %rdx
; SSE41-NEXT: movq %xmm3, %rsi
-; SSE41-NEXT: pextrq $1, %xmm0, %r11
+; SSE41-NEXT: pextrq $1, %xmm0, %rdi
; SSE41-NEXT: pextrq $1, %xmm2, %r8
; SSE41-NEXT: pextrq $1, %xmm1, %r9
; SSE41-NEXT: pextrq $1, %xmm3, %r10
-; SSE41-NEXT: movq %xmm4, %rdi
-; SSE41-NEXT: xorq %rax, %rdi
+; SSE41-NEXT: movq %xmm4, %r11
+; SSE41-NEXT: xorq %rax, %r11
; SSE41-NEXT: movq %xmm6, %rax
; SSE41-NEXT: xorq %rcx, %rax
; SSE41-NEXT: movq %xmm5, %rcx
@@ -298,9 +298,9 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE41-NEXT: xorq %rsi, %rdx
; SSE41-NEXT: orq %rcx, %rdx
; SSE41-NEXT: orq %rax, %rdx
-; SSE41-NEXT: orq %rdi, %rdx
+; SSE41-NEXT: orq %r11, %rdx
; SSE41-NEXT: pextrq $1, %xmm4, %rax
-; SSE41-NEXT: xorq %r11, %rax
+; SSE41-NEXT: xorq %rdi, %rax
; SSE41-NEXT: pextrq $1, %xmm6, %rcx
; SSE41-NEXT: xorq %r8, %rcx
; SSE41-NEXT: pextrq $1, %xmm5, %rsi
@@ -322,13 +322,13 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovq %xmm4, %rdi
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vmovq %xmm5, %rax
-; AVX1-NEXT: vpextrq $1, %xmm0, %r11
+; AVX1-NEXT: vmovq %xmm5, %r8
+; AVX1-NEXT: vpextrq $1, %xmm0, %r9
; AVX1-NEXT: vpextrq $1, %xmm1, %r10
-; AVX1-NEXT: vpextrq $1, %xmm4, %r9
-; AVX1-NEXT: vpextrq $1, %xmm5, %r8
-; AVX1-NEXT: vmovq %xmm2, %rcx
-; AVX1-NEXT: xorq %rdx, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm5, %rax
+; AVX1-NEXT: vmovq %xmm2, %r11
+; AVX1-NEXT: xorq %rdx, %r11
; AVX1-NEXT: vmovq %xmm3, %rdx
; AVX1-NEXT: xorq %rsi, %rdx
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
@@ -336,23 +336,23 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX1-NEXT: xorq %rdi, %rsi
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rdi
-; AVX1-NEXT: xorq %rax, %rdi
+; AVX1-NEXT: xorq %r8, %rdi
; AVX1-NEXT: orq %rsi, %rdi
; AVX1-NEXT: orq %rdx, %rdi
-; AVX1-NEXT: orq %rcx, %rdi
-; AVX1-NEXT: vpextrq $1, %xmm2, %rax
-; AVX1-NEXT: xorq %r11, %rax
-; AVX1-NEXT: vpextrq $1, %xmm3, %rcx
-; AVX1-NEXT: xorq %r10, %rcx
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: orq %r11, %rdi
+; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
; AVX1-NEXT: xorq %r9, %rdx
-; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX1-NEXT: xorq %r8, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: orq %rcx, %rsi
-; AVX1-NEXT: orq %rax, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm3, %rsi
+; AVX1-NEXT: xorq %r10, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm0, %r8
+; AVX1-NEXT: xorq %rcx, %r8
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: xorq %rax, %rcx
+; AVX1-NEXT: orq %r8, %rcx
+; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: orq %rdx, %rcx
; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: orq %rdi, %rsi
+; AVX1-NEXT: orq %rdi, %rcx
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -364,13 +364,13 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vmovq %xmm4, %rdi
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vmovq %xmm5, %rax
-; AVX2-NEXT: vpextrq $1, %xmm0, %r11
+; AVX2-NEXT: vmovq %xmm5, %r8
+; AVX2-NEXT: vpextrq $1, %xmm0, %r9
; AVX2-NEXT: vpextrq $1, %xmm1, %r10
-; AVX2-NEXT: vpextrq $1, %xmm4, %r9
-; AVX2-NEXT: vpextrq $1, %xmm5, %r8
-; AVX2-NEXT: vmovq %xmm2, %rcx
-; AVX2-NEXT: xorq %rdx, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm5, %rax
+; AVX2-NEXT: vmovq %xmm2, %r11
+; AVX2-NEXT: xorq %rdx, %r11
; AVX2-NEXT: vmovq %xmm3, %rdx
; AVX2-NEXT: xorq %rsi, %rdx
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -378,23 +378,23 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX2-NEXT: xorq %rdi, %rsi
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rdi
-; AVX2-NEXT: xorq %rax, %rdi
+; AVX2-NEXT: xorq %r8, %rdi
; AVX2-NEXT: orq %rsi, %rdi
; AVX2-NEXT: orq %rdx, %rdi
-; AVX2-NEXT: orq %rcx, %rdi
-; AVX2-NEXT: vpextrq $1, %xmm2, %rax
-; AVX2-NEXT: xorq %r11, %rax
-; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
-; AVX2-NEXT: xorq %r10, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: orq %r11, %rdi
+; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
; AVX2-NEXT: xorq %r9, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT: xorq %r8, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: orq %rcx, %rsi
-; AVX2-NEXT: orq %rax, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm3, %rsi
+; AVX2-NEXT: xorq %r10, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm0, %r8
+; AVX2-NEXT: xorq %rcx, %r8
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: xorq %rax, %rcx
+; AVX2-NEXT: orq %r8, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rdi, %rsi
+; AVX2-NEXT: orq %rdi, %rcx
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -433,14 +433,14 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; SSE2-NEXT: movq %xmm8, %rax
-; SSE2-NEXT: movq %xmm0, %r11
+; SSE2-NEXT: movq %xmm8, %r8
+; SSE2-NEXT: movq %xmm0, %r9
; SSE2-NEXT: movq %xmm2, %r10
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %xmm3, %r8
+; SSE2-NEXT: movq %xmm1, %rcx
+; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: xorq %rdx, %rcx
+; SSE2-NEXT: movq %xmm0, %r11
+; SSE2-NEXT: xorq %rdx, %r11
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: xorq %rsi, %rdx
@@ -449,23 +449,23 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE2-NEXT: xorq %rdi, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: xorq %rax, %rdi
+; SSE2-NEXT: xorq %r8, %rdi
; SSE2-NEXT: orq %rsi, %rdi
; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: orq %rcx, %rdi
-; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: xorq %r11, %rax
-; SSE2-NEXT: movq %xmm6, %rcx
-; SSE2-NEXT: xorq %r10, %rcx
-; SSE2-NEXT: movq %xmm5, %rdx
+; SSE2-NEXT: orq %r11, %rdi
+; SSE2-NEXT: movq %xmm4, %rdx
; SSE2-NEXT: xorq %r9, %rdx
-; SSE2-NEXT: movq %xmm7, %rsi
-; SSE2-NEXT: xorq %r8, %rsi
-; SSE2-NEXT: orq %rdx, %rsi
-; SSE2-NEXT: orq %rcx, %rsi
-; SSE2-NEXT: orq %rax, %rsi
+; SSE2-NEXT: movq %xmm6, %rsi
+; SSE2-NEXT: xorq %r10, %rsi
+; SSE2-NEXT: movq %xmm5, %r8
+; SSE2-NEXT: xorq %rcx, %r8
+; SSE2-NEXT: movq %xmm7, %rcx
+; SSE2-NEXT: xorq %rax, %rcx
+; SSE2-NEXT: orq %r8, %rcx
+; SSE2-NEXT: orq %rsi, %rcx
+; SSE2-NEXT: orq %rdx, %rcx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: orq %rdi, %rcx
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
@@ -475,12 +475,12 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE41-NEXT: movq %xmm2, %rcx
; SSE41-NEXT: movq %xmm1, %rdx
; SSE41-NEXT: movq %xmm3, %rsi
-; SSE41-NEXT: pextrq $1, %xmm0, %r11
+; SSE41-NEXT: pextrq $1, %xmm0, %rdi
; SSE41-NEXT: pextrq $1, %xmm2, %r8
; SSE41-NEXT: pextrq $1, %xmm1, %r9
; SSE41-NEXT: pextrq $1, %xmm3, %r10
-; SSE41-NEXT: movq %xmm4, %rdi
-; SSE41-NEXT: xorq %rax, %rdi
+; SSE41-NEXT: movq %xmm4, %r11
+; SSE41-NEXT: xorq %rax, %r11
; SSE41-NEXT: movq %xmm6, %rax
; SSE41-NEXT: xorq %rcx, %rax
; SSE41-NEXT: movq %xmm5, %rcx
@@ -489,9 +489,9 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; SSE41-NEXT: xorq %rsi, %rdx
; SSE41-NEXT: orq %rcx, %rdx
; SSE41-NEXT: orq %rax, %rdx
-; SSE41-NEXT: orq %rdi, %rdx
+; SSE41-NEXT: orq %r11, %rdx
; SSE41-NEXT: pextrq $1, %xmm4, %rax
-; SSE41-NEXT: xorq %r11, %rax
+; SSE41-NEXT: xorq %rdi, %rax
; SSE41-NEXT: pextrq $1, %xmm6, %rcx
; SSE41-NEXT: xorq %r8, %rcx
; SSE41-NEXT: pextrq $1, %xmm5, %rsi
@@ -513,13 +513,13 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovq %xmm4, %rdi
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vmovq %xmm5, %rax
-; AVX1-NEXT: vpextrq $1, %xmm0, %r11
+; AVX1-NEXT: vmovq %xmm5, %r8
+; AVX1-NEXT: vpextrq $1, %xmm0, %r9
; AVX1-NEXT: vpextrq $1, %xmm1, %r10
-; AVX1-NEXT: vpextrq $1, %xmm4, %r9
-; AVX1-NEXT: vpextrq $1, %xmm5, %r8
-; AVX1-NEXT: vmovq %xmm2, %rcx
-; AVX1-NEXT: xorq %rdx, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm5, %rax
+; AVX1-NEXT: vmovq %xmm2, %r11
+; AVX1-NEXT: xorq %rdx, %r11
; AVX1-NEXT: vmovq %xmm3, %rdx
; AVX1-NEXT: xorq %rsi, %rdx
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
@@ -527,23 +527,23 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX1-NEXT: xorq %rdi, %rsi
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rdi
-; AVX1-NEXT: xorq %rax, %rdi
+; AVX1-NEXT: xorq %r8, %rdi
; AVX1-NEXT: orq %rsi, %rdi
; AVX1-NEXT: orq %rdx, %rdi
-; AVX1-NEXT: orq %rcx, %rdi
-; AVX1-NEXT: vpextrq $1, %xmm2, %rax
-; AVX1-NEXT: xorq %r11, %rax
-; AVX1-NEXT: vpextrq $1, %xmm3, %rcx
-; AVX1-NEXT: xorq %r10, %rcx
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: orq %r11, %rdi
+; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
; AVX1-NEXT: xorq %r9, %rdx
-; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX1-NEXT: xorq %r8, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: orq %rcx, %rsi
-; AVX1-NEXT: orq %rax, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm3, %rsi
+; AVX1-NEXT: xorq %r10, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm0, %r8
+; AVX1-NEXT: xorq %rcx, %r8
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: xorq %rax, %rcx
+; AVX1-NEXT: orq %r8, %rcx
+; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: orq %rdx, %rcx
; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: orq %rdi, %rsi
+; AVX1-NEXT: orq %rdi, %rcx
; AVX1-NEXT: sete %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -555,13 +555,13 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vmovq %xmm4, %rdi
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vmovq %xmm5, %rax
-; AVX2-NEXT: vpextrq $1, %xmm0, %r11
+; AVX2-NEXT: vmovq %xmm5, %r8
+; AVX2-NEXT: vpextrq $1, %xmm0, %r9
; AVX2-NEXT: vpextrq $1, %xmm1, %r10
-; AVX2-NEXT: vpextrq $1, %xmm4, %r9
-; AVX2-NEXT: vpextrq $1, %xmm5, %r8
-; AVX2-NEXT: vmovq %xmm2, %rcx
-; AVX2-NEXT: xorq %rdx, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm5, %rax
+; AVX2-NEXT: vmovq %xmm2, %r11
+; AVX2-NEXT: xorq %rdx, %r11
; AVX2-NEXT: vmovq %xmm3, %rdx
; AVX2-NEXT: xorq %rsi, %rdx
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -569,23 +569,23 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
; AVX2-NEXT: xorq %rdi, %rsi
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rdi
-; AVX2-NEXT: xorq %rax, %rdi
+; AVX2-NEXT: xorq %r8, %rdi
; AVX2-NEXT: orq %rsi, %rdi
; AVX2-NEXT: orq %rdx, %rdi
-; AVX2-NEXT: orq %rcx, %rdi
-; AVX2-NEXT: vpextrq $1, %xmm2, %rax
-; AVX2-NEXT: xorq %r11, %rax
-; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
-; AVX2-NEXT: xorq %r10, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: orq %r11, %rdi
+; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
; AVX2-NEXT: xorq %r9, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT: xorq %r8, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: orq %rcx, %rsi
-; AVX2-NEXT: orq %rax, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm3, %rsi
+; AVX2-NEXT: xorq %r10, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm0, %r8
+; AVX2-NEXT: xorq %rcx, %r8
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: xorq %rax, %rcx
+; AVX2-NEXT: orq %r8, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rdi, %rsi
+; AVX2-NEXT: orq %rdi, %rcx
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -736,59 +736,59 @@ define i32 @eq_i128_pair(ptr %a, ptr %b) {
define i32 @ne_i256_pair(ptr %a, ptr %b) {
; SSE2-LABEL: ne_i256_pair:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq 16(%rdi), %r9
-; SSE2-NEXT: movq 24(%rdi), %r11
-; SSE2-NEXT: movq (%rdi), %r8
-; SSE2-NEXT: movq 8(%rdi), %r10
-; SSE2-NEXT: xorq 8(%rsi), %r10
-; SSE2-NEXT: xorq 24(%rsi), %r11
-; SSE2-NEXT: xorq (%rsi), %r8
-; SSE2-NEXT: xorq 16(%rsi), %r9
-; SSE2-NEXT: movq 48(%rdi), %rcx
-; SSE2-NEXT: movq 32(%rdi), %rax
-; SSE2-NEXT: movq 56(%rdi), %rdx
+; SSE2-NEXT: movq 16(%rdi), %rcx
+; SSE2-NEXT: movq 24(%rdi), %rdx
+; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movq 8(%rdi), %r8
+; SSE2-NEXT: xorq 8(%rsi), %r8
+; SSE2-NEXT: xorq 24(%rsi), %rdx
+; SSE2-NEXT: xorq (%rsi), %rax
+; SSE2-NEXT: xorq 16(%rsi), %rcx
+; SSE2-NEXT: movq 48(%rdi), %r9
+; SSE2-NEXT: movq 32(%rdi), %r10
+; SSE2-NEXT: movq 56(%rdi), %r11
; SSE2-NEXT: movq 40(%rdi), %rdi
; SSE2-NEXT: xorq 40(%rsi), %rdi
-; SSE2-NEXT: xorq 56(%rsi), %rdx
-; SSE2-NEXT: orq %r11, %rdx
-; SSE2-NEXT: orq %rdi, %rdx
-; SSE2-NEXT: orq %r10, %rdx
-; SSE2-NEXT: xorq 32(%rsi), %rax
-; SSE2-NEXT: xorq 48(%rsi), %rcx
-; SSE2-NEXT: orq %r9, %rcx
-; SSE2-NEXT: orq %rax, %rcx
-; SSE2-NEXT: orq %r8, %rcx
+; SSE2-NEXT: xorq 56(%rsi), %r11
+; SSE2-NEXT: orq %rdx, %r11
+; SSE2-NEXT: orq %rdi, %r11
+; SSE2-NEXT: orq %r8, %r11
+; SSE2-NEXT: xorq 32(%rsi), %r10
+; SSE2-NEXT: xorq 48(%rsi), %r9
+; SSE2-NEXT: orq %rcx, %r9
+; SSE2-NEXT: orq %r10, %r9
+; SSE2-NEXT: orq %rax, %r9
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: orq %r11, %r9
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
; SSE41-LABEL: ne_i256_pair:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq 16(%rdi), %r9
-; SSE41-NEXT: movq 24(%rdi), %r11
-; SSE41-NEXT: movq (%rdi), %r8
-; SSE41-NEXT: movq 8(%rdi), %r10
-; SSE41-NEXT: xorq 8(%rsi), %r10
-; SSE41-NEXT: xorq 24(%rsi), %r11
-; SSE41-NEXT: xorq (%rsi), %r8
-; SSE41-NEXT: xorq 16(%rsi), %r9
-; SSE41-NEXT: movq 48(%rdi), %rcx
-; SSE41-NEXT: movq 32(%rdi), %rax
-; SSE41-NEXT: movq 56(%rdi), %rdx
+; SSE41-NEXT: movq 16(%rdi), %rcx
+; SSE41-NEXT: movq 24(%rdi), %rdx
+; SSE41-NEXT: movq (%rdi), %rax
+; SSE41-NEXT: movq 8(%rdi), %r8
+; SSE41-NEXT: xorq 8(%rsi), %r8
+; SSE41-NEXT: xorq 24(%rsi), %rdx
+; SSE41-NEXT: xorq (%rsi), %rax
+; SSE41-NEXT: xorq 16(%rsi), %rcx
+; SSE41-NEXT: movq 48(%rdi), %r9
+; SSE41-NEXT: movq 32(%rdi), %r10
+; SSE41-NEXT: movq 56(%rdi), %r11
; SSE41-NEXT: movq 40(%rdi), %rdi
; SSE41-NEXT: xorq 40(%rsi), %rdi
-; SSE41-NEXT: xorq 56(%rsi), %rdx
-; SSE41-NEXT: orq %r11, %rdx
-; SSE41-NEXT: orq %rdi, %rdx
-; SSE41-NEXT: orq %r10, %rdx
-; SSE41-NEXT: xorq 32(%rsi), %rax
-; SSE41-NEXT: xorq 48(%rsi), %rcx
-; SSE41-NEXT: orq %r9, %rcx
-; SSE41-NEXT: orq %rax, %rcx
-; SSE41-NEXT: orq %r8, %rcx
+; SSE41-NEXT: xorq 56(%rsi), %r11
+; SSE41-NEXT: orq %rdx, %r11
+; SSE41-NEXT: orq %rdi, %r11
+; SSE41-NEXT: orq %r8, %r11
+; SSE41-NEXT: xorq 32(%rsi), %r10
+; SSE41-NEXT: xorq 48(%rsi), %r9
+; SSE41-NEXT: orq %rcx, %r9
+; SSE41-NEXT: orq %r10, %r9
+; SSE41-NEXT: orq %rax, %r9
; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %rdx, %rcx
+; SSE41-NEXT: orq %r11, %r9
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
@@ -850,59 +850,59 @@ define i32 @ne_i256_pair(ptr %a, ptr %b) {
define i32 @eq_i256_pair(ptr %a, ptr %b) {
; SSE2-LABEL: eq_i256_pair:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq 16(%rdi), %r9
-; SSE2-NEXT: movq 24(%rdi), %r11
-; SSE2-NEXT: movq (%rdi), %r8
-; SSE2-NEXT: movq 8(%rdi), %r10
-; SSE2-NEXT: xorq 8(%rsi), %r10
-; SSE2-NEXT: xorq 24(%rsi), %r11
-; SSE2-NEXT: xorq (%rsi), %r8
-; SSE2-NEXT: xorq 16(%rsi), %r9
-; SSE2-NEXT: movq 48(%rdi), %rcx
-; SSE2-NEXT: movq 32(%rdi), %rax
-; SSE2-NEXT: movq 56(%rdi), %rdx
+; SSE2-NEXT: movq 16(%rdi), %rcx
+; SSE2-NEXT: movq 24(%rdi), %rdx
+; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movq 8(%rdi), %r8
+; SSE2-NEXT: xorq 8(%rsi), %r8
+; SSE2-NEXT: xorq 24(%rsi), %rdx
+; SSE2-NEXT: xorq (%rsi), %rax
+; SSE2-NEXT: xorq 16(%rsi), %rcx
+; SSE2-NEXT: movq 48(%rdi), %r9
+; SSE2-NEXT: movq 32(%rdi), %r10
+; SSE2-NEXT: movq 56(%rdi), %r11
; SSE2-NEXT: movq 40(%rdi), %rdi
; SSE2-NEXT: xorq 40(%rsi), %rdi
-; SSE2-NEXT: xorq 56(%rsi), %rdx
-; SSE2-NEXT: orq %r11, %rdx
-; SSE2-NEXT: orq %rdi, %rdx
-; SSE2-NEXT: orq %r10, %rdx
-; SSE2-NEXT: xorq 32(%rsi), %rax
-; SSE2-NEXT: xorq 48(%rsi), %rcx
-; SSE2-NEXT: orq %r9, %rcx
-; SSE2-NEXT: orq %rax, %rcx
-; SSE2-NEXT: orq %r8, %rcx
+; SSE2-NEXT: xorq 56(%rsi), %r11
+; SSE2-NEXT: orq %rdx, %r11
+; SSE2-NEXT: orq %rdi, %r11
+; SSE2-NEXT: orq %r8, %r11
+; SSE2-NEXT: xorq 32(%rsi), %r10
+; SSE2-NEXT: xorq 48(%rsi), %r9
+; SSE2-NEXT: orq %rcx, %r9
+; SSE2-NEXT: orq %r10, %r9
+; SSE2-NEXT: orq %rax, %r9
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: orq %r11, %r9
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; SSE41-LABEL: eq_i256_pair:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq 16(%rdi), %r9
-; SSE41-NEXT: movq 24(%rdi), %r11
-; SSE41-NEXT: movq (%rdi), %r8
-; SSE41-NEXT: movq 8(%rdi), %r10
-; SSE41-NEXT: xorq 8(%rsi), %r10
-; SSE41-NEXT: xorq 24(%rsi), %r11
-; SSE41-NEXT: xorq (%rsi), %r8
-; SSE41-NEXT: xorq 16(%rsi), %r9
-; SSE41-NEXT: movq 48(%rdi), %rcx
-; SSE41-NEXT: movq 32(%rdi), %rax
-; SSE41-NEXT: movq 56(%rdi), %rdx
+; SSE41-NEXT: movq 16(%rdi), %rcx
+; SSE41-NEXT: movq 24(%rdi), %rdx
+; SSE41-NEXT: movq (%rdi), %rax
+; SSE41-NEXT: movq 8(%rdi), %r8
+; SSE41-NEXT: xorq 8(%rsi), %r8
+; SSE41-NEXT: xorq 24(%rsi), %rdx
+; SSE41-NEXT: xorq (%rsi), %rax
+; SSE41-NEXT: xorq 16(%rsi), %rcx
+; SSE41-NEXT: movq 48(%rdi), %r9
+; SSE41-NEXT: movq 32(%rdi), %r10
+; SSE41-NEXT: movq 56(%rdi), %r11
; SSE41-NEXT: movq 40(%rdi), %rdi
; SSE41-NEXT: xorq 40(%rsi), %rdi
-; SSE41-NEXT: xorq 56(%rsi), %rdx
-; SSE41-NEXT: orq %r11, %rdx
-; SSE41-NEXT: orq %rdi, %rdx
-; SSE41-NEXT: orq %r10, %rdx
-; SSE41-NEXT: xorq 32(%rsi), %rax
-; SSE41-NEXT: xorq 48(%rsi), %rcx
-; SSE41-NEXT: orq %r9, %rcx
-; SSE41-NEXT: orq %rax, %rcx
-; SSE41-NEXT: orq %r8, %rcx
+; SSE41-NEXT: xorq 56(%rsi), %r11
+; SSE41-NEXT: orq %rdx, %r11
+; SSE41-NEXT: orq %rdi, %r11
+; SSE41-NEXT: orq %r8, %r11
+; SSE41-NEXT: xorq 32(%rsi), %r10
+; SSE41-NEXT: xorq 48(%rsi), %r9
+; SSE41-NEXT: orq %rcx, %r9
+; SSE41-NEXT: orq %r10, %r9
+; SSE41-NEXT: orq %rax, %r9
; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %rdx, %rcx
+; SSE41-NEXT: orq %r11, %r9
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
@@ -964,54 +964,54 @@ define i32 @eq_i256_pair(ptr %a, ptr %b) {
define i32 @ne_i512_pair(ptr %a, ptr %b) {
; NO512-LABEL: ne_i512_pair:
; NO512: # %bb.0:
-; NO512-NEXT: movq 32(%rdi), %r8
-; NO512-NEXT: movq 48(%rdi), %r9
+; NO512-NEXT: movq 32(%rdi), %rax
+; NO512-NEXT: movq 48(%rdi), %rcx
; NO512-NEXT: movq 40(%rdi), %rdx
-; NO512-NEXT: movq 56(%rdi), %rcx
-; NO512-NEXT: xorq 56(%rsi), %rcx
-; NO512-NEXT: movq 120(%rdi), %rax
-; NO512-NEXT: xorq 120(%rsi), %rax
-; NO512-NEXT: orq %rcx, %rax
-; NO512-NEXT: movq 88(%rdi), %rcx
-; NO512-NEXT: xorq 88(%rsi), %rcx
-; NO512-NEXT: orq %rcx, %rax
-; NO512-NEXT: movq 24(%rdi), %rcx
-; NO512-NEXT: xorq 24(%rsi), %rcx
+; NO512-NEXT: movq 56(%rdi), %r8
+; NO512-NEXT: xorq 56(%rsi), %r8
+; NO512-NEXT: movq 120(%rdi), %r9
+; NO512-NEXT: xorq 120(%rsi), %r9
+; NO512-NEXT: orq %r8, %r9
+; NO512-NEXT: movq 88(%rdi), %r8
+; NO512-NEXT: xorq 88(%rsi), %r8
+; NO512-NEXT: orq %r8, %r9
+; NO512-NEXT: movq 24(%rdi), %r8
+; NO512-NEXT: xorq 24(%rsi), %r8
; NO512-NEXT: xorq 40(%rsi), %rdx
-; NO512-NEXT: orq %rcx, %rax
-; NO512-NEXT: movq 104(%rdi), %rcx
-; NO512-NEXT: xorq 104(%rsi), %rcx
-; NO512-NEXT: orq %rdx, %rcx
+; NO512-NEXT: orq %r8, %r9
+; NO512-NEXT: movq 104(%rdi), %r8
+; NO512-NEXT: xorq 104(%rsi), %r8
+; NO512-NEXT: orq %rdx, %r8
; NO512-NEXT: movq 72(%rdi), %rdx
; NO512-NEXT: xorq 72(%rsi), %rdx
-; NO512-NEXT: orq %rdx, %rcx
-; NO512-NEXT: movq 16(%rdi), %r10
-; NO512-NEXT: orq %rax, %rcx
-; NO512-NEXT: movq 8(%rdi), %rax
-; NO512-NEXT: xorq 8(%rsi), %rax
-; NO512-NEXT: xorq 48(%rsi), %r9
-; NO512-NEXT: orq %rax, %rcx
-; NO512-NEXT: movq 112(%rdi), %rax
-; NO512-NEXT: xorq 112(%rsi), %rax
-; NO512-NEXT: orq %r9, %rax
-; NO512-NEXT: movq 80(%rdi), %rdx
-; NO512-NEXT: xorq 80(%rsi), %rdx
-; NO512-NEXT: orq %rdx, %rax
-; NO512-NEXT: movq (%rdi), %r9
-; NO512-NEXT: xorq 16(%rsi), %r10
-; NO512-NEXT: xorq (%rsi), %r9
-; NO512-NEXT: xorq 32(%rsi), %r8
-; NO512-NEXT: orq %r10, %rax
+; NO512-NEXT: orq %rdx, %r8
+; NO512-NEXT: movq 16(%rdi), %rdx
+; NO512-NEXT: orq %r9, %r8
+; NO512-NEXT: movq 8(%rdi), %r9
+; NO512-NEXT: xorq 8(%rsi), %r9
+; NO512-NEXT: xorq 48(%rsi), %rcx
+; NO512-NEXT: orq %r9, %r8
+; NO512-NEXT: movq 112(%rdi), %r9
+; NO512-NEXT: xorq 112(%rsi), %r9
+; NO512-NEXT: orq %rcx, %r9
+; NO512-NEXT: movq 80(%rdi), %rcx
+; NO512-NEXT: xorq 80(%rsi), %rcx
+; NO512-NEXT: orq %rcx, %r9
+; NO512-NEXT: movq (%rdi), %rcx
+; NO512-NEXT: xorq 16(%rsi), %rdx
+; NO512-NEXT: xorq (%rsi), %rcx
+; NO512-NEXT: xorq 32(%rsi), %rax
+; NO512-NEXT: orq %rdx, %r9
; NO512-NEXT: movq 96(%rdi), %rdx
; NO512-NEXT: movq 64(%rdi), %rdi
; NO512-NEXT: xorq 64(%rsi), %rdi
; NO512-NEXT: xorq 96(%rsi), %rdx
-; NO512-NEXT: orq %r8, %rdx
-; NO512-NEXT: orq %rdi, %rdx
; NO512-NEXT: orq %rax, %rdx
+; NO512-NEXT: orq %rdi, %rdx
; NO512-NEXT: orq %r9, %rdx
-; NO512-NEXT: xorl %eax, %eax
; NO512-NEXT: orq %rcx, %rdx
+; NO512-NEXT: xorl %eax, %eax
+; NO512-NEXT: orq %r8, %rdx
; NO512-NEXT: setne %al
; NO512-NEXT: retq
;
@@ -1058,54 +1058,54 @@ define i32 @ne_i512_pair(ptr %a, ptr %b) {
define i32 @eq_i512_pair(ptr %a, ptr %b) {
; NO512-LABEL: eq_i512_pair:
; NO512: # %bb.0:
-; NO512-NEXT: movq 32(%rdi), %r8
-; NO512-NEXT: movq 48(%rdi), %r9
+; NO512-NEXT: movq 32(%rdi), %rax
+; NO512-NEXT: movq 48(%rdi), %rcx
; NO512-NEXT: movq 40(%rdi), %rdx
-; NO512-NEXT: movq 56(%rdi), %rcx
-; NO512-NEXT: xorq 56(%rsi), %rcx
-; NO512-NEXT: movq 120(%rdi), %rax
-; NO512-NEXT: xorq 120(%rsi), %rax
-; NO512-NEXT: orq %rcx, %rax
-; NO512-NEXT: movq 88(%rdi), %rcx
-; NO512-NEXT: xorq 88(%rsi), %rcx
-; NO512-NEXT: orq %rcx, %rax
-; NO512-NEXT: movq 24(%rdi), %rcx
-; NO512-NEXT: xorq 24(%rsi), %rcx
+; NO512-NEXT: movq 56(%rdi), %r8
+; NO512-NEXT: xorq 56(%rsi), %r8
+; NO512-NEXT: movq 120(%rdi), %r9
+; NO512-NEXT: xorq 120(%rsi), %r9
+; NO512-NEXT: orq %r8, %r9
+; NO512-NEXT: movq 88(%rdi), %r8
+; NO512-NEXT: xorq 88(%rsi), %r8
+; NO512-NEXT: orq %r8, %r9
+; NO512-NEXT: movq 24(%rdi), %r8
+; NO512-NEXT: xorq 24(%rsi), %r8
; NO512-NEXT: xorq 40(%rsi), %rdx
-; NO512-NEXT: orq %rcx, %rax
-; NO512-NEXT: movq 104(%rdi), %rcx
-; NO512-NEXT: xorq 104(%rsi), %rcx
-; NO512-NEXT: orq %rdx, %rcx
+; NO512-NEXT: orq %r8, %r9
+; NO512-NEXT: movq 104(%rdi), %r8
+; NO512-NEXT: xorq 104(%rsi), %r8
+; NO512-NEXT: orq %rdx, %r8
; NO512-NEXT: movq 72(%rdi), %rdx
; NO512-NEXT: xorq 72(%rsi), %rdx
-; NO512-NEXT: orq %rdx, %rcx
-; NO512-NEXT: movq 16(%rdi), %r10
-; NO512-NEXT: orq %rax, %rcx
-; NO512-NEXT: movq 8(%rdi), %rax
-; NO512-NEXT: xorq 8(%rsi), %rax
-; NO512-NEXT: xorq 48(%rsi), %r9
-; NO512-NEXT: orq %rax, %rcx
-; NO512-NEXT: movq 112(%rdi), %rax
-; NO512-NEXT: xorq 112(%rsi), %rax
-; NO512-NEXT: orq %r9, %rax
-; NO512-NEXT: movq 80(%rdi), %rdx
-; NO512-NEXT: xorq 80(%rsi), %rdx
-; NO512-NEXT: orq %rdx, %rax
-; NO512-NEXT: movq (%rdi), %r9
-; NO512-NEXT: xorq 16(%rsi), %r10
-; NO512-NEXT: xorq (%rsi), %r9
-; NO512-NEXT: xorq 32(%rsi), %r8
-; NO512-NEXT: orq %r10, %rax
+; NO512-NEXT: orq %rdx, %r8
+; NO512-NEXT: movq 16(%rdi), %rdx
+; NO512-NEXT: orq %r9, %r8
+; NO512-NEXT: movq 8(%rdi), %r9
+; NO512-NEXT: xorq 8(%rsi), %r9
+; NO512-NEXT: xorq 48(%rsi), %rcx
+; NO512-NEXT: orq %r9, %r8
+; NO512-NEXT: movq 112(%rdi), %r9
+; NO512-NEXT: xorq 112(%rsi), %r9
+; NO512-NEXT: orq %rcx, %r9
+; NO512-NEXT: movq 80(%rdi), %rcx
+; NO512-NEXT: xorq 80(%rsi), %rcx
+; NO512-NEXT: orq %rcx, %r9
+; NO512-NEXT: movq (%rdi), %rcx
+; NO512-NEXT: xorq 16(%rsi), %rdx
+; NO512-NEXT: xorq (%rsi), %rcx
+; NO512-NEXT: xorq 32(%rsi), %rax
+; NO512-NEXT: orq %rdx, %r9
; NO512-NEXT: movq 96(%rdi), %rdx
; NO512-NEXT: movq 64(%rdi), %rdi
; NO512-NEXT: xorq 64(%rsi), %rdi
; NO512-NEXT: xorq 96(%rsi), %rdx
-; NO512-NEXT: orq %r8, %rdx
-; NO512-NEXT: orq %rdi, %rdx
; NO512-NEXT: orq %rax, %rdx
+; NO512-NEXT: orq %rdi, %rdx
; NO512-NEXT: orq %r9, %rdx
-; NO512-NEXT: xorl %eax, %eax
; NO512-NEXT: orq %rcx, %rdx
+; NO512-NEXT: xorl %eax, %eax
+; NO512-NEXT: orq %r8, %rdx
; NO512-NEXT: sete %al
; NO512-NEXT: retq
;
@@ -1178,18 +1178,18 @@ define i1 @eq_i256_args(i256 %a, i256 %b) {
define i1 @eq_i512_args(i512 %a, i512 %b) {
; ANY-LABEL: eq_i512_args:
; ANY: # %bb.0:
-; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
+; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
-; ANY-NEXT: orq %rax, %rcx
+; ANY-NEXT: orq %r10, %rcx
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9
; ANY-NEXT: orq %rcx, %r9
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
; ANY-NEXT: orq %r9, %rsi
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
+; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
-; ANY-NEXT: orq %r10, %rdx
+; ANY-NEXT: orq %rax, %rdx
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8
; ANY-NEXT: orq %rdx, %r8
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
@@ -1305,24 +1305,24 @@ define i1 @eq_i256_load_arg(ptr%p, i256 %b) {
define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
; ANY-LABEL: eq_i512_load_arg:
; ANY: # %bb.0:
-; ANY-NEXT: movq 40(%rdi), %r10
-; ANY-NEXT: movq 48(%rdi), %rax
+; ANY-NEXT: movq 40(%rdi), %rax
+; ANY-NEXT: movq 48(%rdi), %r10
; ANY-NEXT: movq 56(%rdi), %r11
; ANY-NEXT: xorq 24(%rdi), %r8
; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r11
; ANY-NEXT: orq %r8, %r11
; ANY-NEXT: xorq 8(%rdi), %rdx
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
-; ANY-NEXT: orq %r11, %r10
-; ANY-NEXT: orq %rdx, %r10
+; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
+; ANY-NEXT: orq %r11, %rax
+; ANY-NEXT: orq %rdx, %rax
; ANY-NEXT: xorq 32(%rdi), %r9
; ANY-NEXT: xorq (%rdi), %rsi
; ANY-NEXT: xorq 16(%rdi), %rcx
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
-; ANY-NEXT: orq %rcx, %rax
-; ANY-NEXT: orq %r9, %rax
-; ANY-NEXT: orq %rsi, %rax
-; ANY-NEXT: orq %r10, %rax
+; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
+; ANY-NEXT: orq %rcx, %r10
+; ANY-NEXT: orq %r9, %r10
+; ANY-NEXT: orq %rsi, %r10
+; ANY-NEXT: orq %rax, %r10
; ANY-NEXT: sete %al
; ANY-NEXT: retq
%a = load i512, ptr %p
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index d85f4f520bd25..8fb34ff811875 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -1014,27 +1014,27 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
;
; x86_64-LABEL: test_ashr_v2i128:
; x86_64: # %bb.0: # %entry
-; x86_64-NEXT: movq %rcx, %r11
+; x86_64-NEXT: movq %rcx, %rax
; x86_64-NEXT: movq {{[0-9]+}}(%rsp), %r10
; x86_64-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
; x86_64-NEXT: movl %r9d, %ecx
-; x86_64-NEXT: shrdq %cl, %r11, %rdx
+; x86_64-NEXT: shrdq %cl, %rax, %rdx
; x86_64-NEXT: movl %r8d, %ecx
; x86_64-NEXT: shrdq %cl, %rsi, %rdi
-; x86_64-NEXT: movq %rsi, %rax
-; x86_64-NEXT: sarq %cl, %rax
+; x86_64-NEXT: movq %rsi, %r11
+; x86_64-NEXT: sarq %cl, %r11
; x86_64-NEXT: sarq $63, %rsi
; x86_64-NEXT: testb $64, %r8b
-; x86_64-NEXT: cmovneq %rax, %rdi
-; x86_64-NEXT: cmoveq %rax, %rsi
-; x86_64-NEXT: movq %r11, %rax
+; x86_64-NEXT: cmovneq %r11, %rdi
+; x86_64-NEXT: cmoveq %r11, %rsi
+; x86_64-NEXT: movq %rax, %r8
; x86_64-NEXT: movl %r9d, %ecx
-; x86_64-NEXT: sarq %cl, %rax
-; x86_64-NEXT: sarq $63, %r11
+; x86_64-NEXT: sarq %cl, %r8
+; x86_64-NEXT: sarq $63, %rax
; x86_64-NEXT: testb $64, %r9b
-; x86_64-NEXT: cmovneq %rax, %rdx
-; x86_64-NEXT: cmoveq %rax, %r11
-; x86_64-NEXT: movq %r11, 24(%r10)
+; x86_64-NEXT: cmovneq %r8, %rdx
+; x86_64-NEXT: cmoveq %r8, %rax
+; x86_64-NEXT: movq %rax, 24(%r10)
; x86_64-NEXT: movq %rdx, 16(%r10)
; x86_64-NEXT: movq %rsi, 8(%r10)
; x86_64-NEXT: movq %rdi, (%r10)
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 8948e9d702b1e..26e2382368d1d 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -2207,34 +2207,34 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
;
; X64-SSE-LABEL: PR34947:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movzwl 16(%rdi), %r8d
+; X64-SSE-NEXT: movzwl 16(%rdi), %ecx
; X64-SSE-NEXT: movdqa (%rdi), %xmm3
; X64-SSE-NEXT: movdqa (%rsi), %xmm0
; X64-SSE-NEXT: movdqa 16(%rsi), %xmm1
; X64-SSE-NEXT: pxor %xmm4, %xmm4
; X64-SSE-NEXT: movdqa %xmm3, %xmm2
; X64-SSE-NEXT: pextrw $7, %xmm3, %eax
-; X64-SSE-NEXT: pextrw $4, %xmm3, %r9d
-; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d
-; X64-SSE-NEXT: pextrw $1, %xmm3, %r11d
-; X64-SSE-NEXT: pextrw $3, %xmm3, %ecx
+; X64-SSE-NEXT: pextrw $4, %xmm3, %edi
+; X64-SSE-NEXT: pextrw $0, %xmm3, %r8d
+; X64-SSE-NEXT: pextrw $1, %xmm3, %r9d
+; X64-SSE-NEXT: pextrw $3, %xmm3, %r10d
; X64-SSE-NEXT: movdqa %xmm3, %xmm5
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X64-SSE-NEXT: movd %xmm3, %edi
+; X64-SSE-NEXT: movd %xmm3, %r11d
; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %edi
+; X64-SSE-NEXT: divl %r11d
; X64-SSE-NEXT: movd %edx, %xmm3
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
; X64-SSE-NEXT: movd %xmm4, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; X64-SSE-NEXT: movd %xmm4, %edi
+; X64-SSE-NEXT: movd %xmm4, %r11d
; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %edi
+; X64-SSE-NEXT: divl %r11d
; X64-SSE-NEXT: movd %edx, %xmm4
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; X64-SSE-NEXT: movl %r9d, %eax
+; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 16(%rsi)
; X64-SSE-NEXT: movd %edx, %xmm3
@@ -2247,33 +2247,33 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
; X64-SSE-NEXT: movd %edx, %xmm1
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; X64-SSE-NEXT: movl %r10d, %eax
+; X64-SSE-NEXT: movl %r8d, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl (%rsi)
; X64-SSE-NEXT: movd %edx, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X64-SSE-NEXT: movd %xmm2, %edi
-; X64-SSE-NEXT: movl %r11d, %eax
+; X64-SSE-NEXT: movl %r9d, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %edi
; X64-SSE-NEXT: movd %edx, %xmm2
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-SSE-NEXT: movd %xmm2, %edi
-; X64-SSE-NEXT: movl %ecx, %eax
+; X64-SSE-NEXT: movl %r10d, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl %edi
; X64-SSE-NEXT: movd %edx, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
; X64-SSE-NEXT: movd %xmm4, %eax
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-SSE-NEXT: movd %xmm0, %ecx
+; X64-SSE-NEXT: movd %xmm0, %edi
; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
+; X64-SSE-NEXT: divl %edi
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE-NEXT: movl %r8d, %eax
+; X64-SSE-NEXT: movl %ecx, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 32(%rsi)
; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
@@ -2305,27 +2305,27 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
; X64-AVX1-NEXT: vmovd %xmm2, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 32(%rsi)
-; X64-AVX1-NEXT: movl %edx, %r8d
+; X64-AVX1-NEXT: movl %edx, %ecx
; X64-AVX1-NEXT: vpextrd $3, %xmm1, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 28(%rsi)
-; X64-AVX1-NEXT: movl %edx, %r9d
+; X64-AVX1-NEXT: movl %edx, %edi
; X64-AVX1-NEXT: vpextrd $2, %xmm1, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 24(%rsi)
-; X64-AVX1-NEXT: movl %edx, %r10d
+; X64-AVX1-NEXT: movl %edx, %r8d
; X64-AVX1-NEXT: vpextrd $1, %xmm1, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 20(%rsi)
-; X64-AVX1-NEXT: movl %edx, %r11d
+; X64-AVX1-NEXT: movl %edx, %r9d
; X64-AVX1-NEXT: vmovd %xmm1, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 16(%rsi)
-; X64-AVX1-NEXT: movl %edx, %ecx
+; X64-AVX1-NEXT: movl %edx, %r10d
; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 12(%rsi)
-; X64-AVX1-NEXT: movl %edx, %edi
+; X64-AVX1-NEXT: movl %edx, %r11d
; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax
; X64-AVX1-NEXT: xorl %edx, %edx
; X64-AVX1-NEXT: divl 8(%rsi)
@@ -2340,15 +2340,15 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
; X64-AVX1-NEXT: vmovd %edx, %xmm0
; X64-AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpinsrd $3, %r11d, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovd %ecx, %xmm2
-; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
+; X64-AVX1-NEXT: vmovd %r10d, %xmm2
+; X64-AVX1-NEXT: vpinsrd $1, %r9d, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007
+; X64-AVX1-NEXT: imull $8199, %ecx, %eax # imm = 0x2007
; X64-AVX1-NEXT: movl %eax, (%rax)
; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax)
; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax)
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index 8199c68616a9f..83802ce434426 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -841,148 +841,147 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %r9, %rbp
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r8, %r13
-; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %r9, %r10
+; X64-NEXT: movq %r8, %rbp
+; X64-NEXT: movq %rcx, %r12
; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rsi, %rbx
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; X64-NEXT: andl $1, %ecx
; X64-NEXT: negq %rcx
-; X64-NEXT: andl $1, %r14d
-; X64-NEXT: negq %r14
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r13
+; X64-NEXT: andl $1, %r12d
+; X64-NEXT: negq %r12
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %rax, %r15
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %rdx, %r11
+; X64-NEXT: addq %rdx, %r15
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: movq %r10, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %rax, %r11
+; X64-NEXT: addq %rax, %r15
; X64-NEXT: adcq %rdx, %r9
-; X64-NEXT: setb %bl
-; X64-NEXT: movzbl %bl, %r10d
+; X64-NEXT: setb %dil
+; X64-NEXT: movzbl %dil, %r10d
; X64-NEXT: addq %rax, %r9
; X64-NEXT: adcq %rdx, %r10
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r13
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r15, %rbp
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rsi, %r11
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r12
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rbp, %rax
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: addq %r11, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rdi, %rsi
+; X64-NEXT: adcq %rdi, %rbx
; X64-NEXT: setb %dil
; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: addq %rsi, %rax
-; X64-NEXT: movzbl %dil, %edx
-; X64-NEXT: adcq %rdx, %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: adcq %r11, %r15
+; X64-NEXT: movzbl %dil, %edx
+; X64-NEXT: adcq %rdx, %rsi
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: adcq %r15, %rsi
; X64-NEXT: adcq $0, %r9
; X64-NEXT: adcq $0, %r10
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: mulq %r13
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r11, %rax
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %r11, %rbp
-; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: addq %r12, %rbp
-; X64-NEXT: adcq %rax, %r11
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: addq %r13, %rbx
+; X64-NEXT: adcq %r11, %r15
; X64-NEXT: setb %al
-; X64-NEXT: addq %rdi, %r11
+; X64-NEXT: addq %r8, %r15
; X64-NEXT: movzbl %al, %r8d
; X64-NEXT: adcq %rdx, %r8
-; X64-NEXT: addq %r12, %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %rbp
-; X64-NEXT: adcq $0, %r11
+; X64-NEXT: addq %r13, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %rsi, %rbx
+; X64-NEXT: adcq $0, %r15
; X64-NEXT: adcq $0, %r8
-; X64-NEXT: addq %r9, %r11
+; X64-NEXT: addq %r9, %r15
; X64-NEXT: adcq %r10, %r8
; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rdx, %r15
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rdx, %r11
; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: addq %rax, %r15
+; X64-NEXT: addq %rax, %r11
; X64-NEXT: adcq %rdx, %rdi
; X64-NEXT: setb %r9b
; X64-NEXT: addq %rax, %rdi
; X64-NEXT: movzbl %r9b, %esi
; X64-NEXT: adcq %rdx, %rsi
-; X64-NEXT: addq %rax, %r11
-; X64-NEXT: adcq %r8, %r15
+; X64-NEXT: addq %rax, %r15
+; X64-NEXT: adcq %r8, %r11
; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %rdi
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NEXT: movq %rsi, %r8
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: addq %rax, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: addq %r14, %r8
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: adcq $0, %r10
-; X64-NEXT: addq %rbx, %r8
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: addq %r9, %r8
+; X64-NEXT: adcq %r14, %r10
; X64-NEXT: setb %al
; X64-NEXT: addq %rsi, %r10
; X64-NEXT: movzbl %al, %esi
; X64-NEXT: adcq %rdx, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %r14, %rax
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: imulq %r14, %r13
-; X64-NEXT: addq %rax, %r13
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: imulq %r12, %rax
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: imulq %r12, %rbp
+; X64-NEXT: addq %rax, %rbp
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: imulq %rcx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rbx, %r14
-; X64-NEXT: addq %rax, %r14
-; X64-NEXT: adcq %rdx, %r13
-; X64-NEXT: addq %r10, %r14
-; X64-NEXT: adcq %rsi, %r13
-; X64-NEXT: movq %r12, %rbx
+; X64-NEXT: movq %r9, %r12
+; X64-NEXT: addq %rax, %r12
+; X64-NEXT: adcq %rdx, %rbp
+; X64-NEXT: addq %r10, %r12
+; X64-NEXT: adcq %rsi, %rbp
+; X64-NEXT: movq %r13, %r14
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: addq %rsi, %rbx
+; X64-NEXT: addq %rsi, %r14
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: addq %rdx, %r14
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; X64-NEXT: adcq %r9, %rsi
; X64-NEXT: setb %r10b
@@ -994,35 +993,35 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
; X64-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: addq %r9, %rcx
-; X64-NEXT: addq %r12, %rax
+; X64-NEXT: addq %r13, %rax
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: addq %rsi, %rax
; X64-NEXT: adcq %r10, %rcx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; X64-NEXT: adcq %r8, %rbx
-; X64-NEXT: adcq %r14, %rax
-; X64-NEXT: adcq %r13, %rcx
-; X64-NEXT: addq %r11, %r12
-; X64-NEXT: adcq %r15, %rbx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: adcq %r8, %r14
+; X64-NEXT: adcq %r12, %rax
+; X64-NEXT: adcq %rbp, %rcx
+; X64-NEXT: addq %r15, %r13
+; X64-NEXT: adcq %r11, %r14
; X64-NEXT: adcq %rdi, %rax
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq %rbp, %rdx
+; X64-NEXT: movq %rbx, %rdx
; X64-NEXT: sarq $63, %rdx
; X64-NEXT: xorq %rdx, %rcx
-; X64-NEXT: xorq %rdx, %rbx
-; X64-NEXT: orq %rcx, %rbx
+; X64-NEXT: xorq %rdx, %r14
+; X64-NEXT: orq %rcx, %r14
; X64-NEXT: xorq %rdx, %rax
-; X64-NEXT: orq %rbx, %rax
-; X64-NEXT: xorq %r12, %rdx
+; X64-NEXT: orq %r14, %rax
+; X64-NEXT: xorq %r13, %rdx
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movl %eax, %esi
; X64-NEXT: andl $1, %esi
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: negq %rcx
-; X64-NEXT: xorq %rcx, %rbp
+; X64-NEXT: xorq %rcx, %rbx
; X64-NEXT: xorq %rax, %rcx
-; X64-NEXT: orq %rbp, %rcx
+; X64-NEXT: orq %rbx, %rcx
; X64-NEXT: orq %rdx, %rcx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 7b2695568c618..367ca660cda14 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -9,71 +9,67 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: pushq %r14
; X64-NEXT: .cfi_def_cfa_offset 24
-; X64-NEXT: pushq %r12
-; X64-NEXT: .cfi_def_cfa_offset 32
; X64-NEXT: pushq %rbx
-; X64-NEXT: .cfi_def_cfa_offset 40
-; X64-NEXT: .cfi_offset %rbx, -40
-; X64-NEXT: .cfi_offset %r12, -32
+; X64-NEXT: .cfi_def_cfa_offset 32
+; X64-NEXT: .cfi_offset %rbx, -32
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rsi, %r10
-; X64-NEXT: movq %rdi, %r15
-; X64-NEXT: sarq $63, %rsi
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: sarq $63, %r14
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: imulq %rsi, %rdi
+; X64-NEXT: imulq %r14, %rdi
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: mulq %rsi
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: imulq %rcx, %rsi
-; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: imulq %rcx, %r14
+; X64-NEXT: addq %rdx, %r14
; X64-NEXT: movq %rcx, %rdi
; X64-NEXT: sarq $63, %rdi
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: imulq %r10, %rbx
+; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: imulq %rsi, %r15
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: mulq %r10
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rbx, %rdx
-; X64-NEXT: imulq %r15, %rdi
+; X64-NEXT: addq %r15, %rdx
+; X64-NEXT: imulq %r10, %rdi
; X64-NEXT: addq %rdx, %rdi
; X64-NEXT: addq %r9, %r11
-; X64-NEXT: adcq %rsi, %rdi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: adcq %r14, %rdi
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r14, %rsi
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r14, %r15
; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rsi, %r14
-; X64-NEXT: adcq %rbx, %r15
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %r15, %r10
+; X64-NEXT: adcq %rbx, %r14
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: addq %r15, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: adcq %rbx, %rdx
; X64-NEXT: addq %r11, %rax
; X64-NEXT: adcq %rdi, %rdx
-; X64-NEXT: movq %r14, 8(%r8)
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: xorq %r14, %rdx
-; X64-NEXT: xorq %rax, %r14
-; X64-NEXT: orq %rdx, %r14
+; X64-NEXT: movq %r10, 8(%r8)
+; X64-NEXT: sarq $63, %r10
+; X64-NEXT: xorq %r10, %rdx
+; X64-NEXT: xorq %rax, %r10
+; X64-NEXT: orq %rdx, %r10
; X64-NEXT: setne %al
; X64-NEXT: movq %r9, (%r8)
; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r12
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
; X64-NEXT: retq
@@ -364,228 +360,232 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X64-NEXT: .cfi_offset %r14, -32
; X64-NEXT: .cfi_offset %r15, -24
; X64-NEXT: .cfi_offset %rbp, -16
+; X64-NEXT: movq %rcx, %r11
; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: movq %rsi, %r15
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r10, %rbp
-; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rsi, %r10
+; X64-NEXT: adcq $0, %rcx
; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rbp, %r12
-; X64-NEXT: adcq %rsi, %rbx
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r10, %r14
+; X64-NEXT: adcq %rcx, %r12
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movzbl %al, %ecx
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rbx, %r10
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r12, %rbx
+; X64-NEXT: adcq %rcx, %r11
; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rsi, %rbx
-; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r8, %r13
+; X64-NEXT: adcq $0, %r12
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r13
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %rbp, %rdi
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rdi, %rsi
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %r9
+; X64-NEXT: adcq %r12, %r10
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %r10, %r8
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %rbp
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT: addq %r11, %rsi
-; X64-NEXT: adcq %r12, %r9
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r13, %rbx
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
+; X64-NEXT: adcq %r14, %rbp
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rdi, %rcx
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq %rbp, %rdi
-; X64-NEXT: setb %r11b
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rdi, %rbp
-; X64-NEXT: movzbl %r11b, %eax
-; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: addq %rsi, %r12
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r9, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: addq %r10, %rbp
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %r10, %r9
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: adcq %r13, %r11
+; X64-NEXT: setb %cl
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r11, %r13
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %r10
+; X64-NEXT: addq %r8, %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %rbp, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: addq %rbx, %r13
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
+; X64-NEXT: setb %cl
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r10, %rbx
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r13
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rsi, %r11
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %rbp, %r9
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %rcx, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X64-NEXT: adcq %rax, %r11
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: sarq $63, %rsi
-; X64-NEXT: movq %rsi, %rdi
-; X64-NEXT: imulq %r13, %rdi
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
+; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: imulq %rsi, %r15
-; X64-NEXT: addq %rdx, %r15
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload
-; X64-NEXT: imulq %rbp, %rbx
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: imulq %rsi, %r8
-; X64-NEXT: addq %rbx, %r8
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: addq %rax, %rcx
-; X64-NEXT: adcq %r15, %r8
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rdi, %r15
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: addq %r8, %rax
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: adcq %rdi, %r9
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r9, %r14
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: adcq %rax, %rbp
+; X64-NEXT: addq %r13, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %r10, %rsi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %r14
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: addq %rbx, %r15
-; X64-NEXT: adcq %rdi, %rbp
-; X64-NEXT: setb %bl
-; X64-NEXT: addq %rax, %rbp
-; X64-NEXT: movzbl %bl, %r9d
-; X64-NEXT: adcq %rdx, %r9
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq %r8, %r9
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: movq %rbx, %r10
; X64-NEXT: sarq $63, %r13
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %r13, %r15
+; X64-NEXT: addq %rdx, %r15
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
+; X64-NEXT: imulq %rdi, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: imulq %r13, %rsi
+; X64-NEXT: addq %rcx, %rsi
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: addq %rax, %r8
+; X64-NEXT: adcq %r15, %rsi
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
-; X64-NEXT: movq %r10, %rcx
-; X64-NEXT: imulq %r13, %rcx
-; X64-NEXT: addq %rsi, %rcx
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r9, %r15
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: addq %rcx, %r15
+; X64-NEXT: adcq %r9, %r13
+; X64-NEXT: setb %cl
+; X64-NEXT: addq %rax, %r13
+; X64-NEXT: movzbl %cl, %r9d
+; X64-NEXT: adcq %rdx, %r9
+; X64-NEXT: addq %r8, %r13
+; X64-NEXT: adcq %rsi, %r9
+; X64-NEXT: sarq $63, %r12
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: imulq %r12, %r8
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: imulq %r13, %rsi
-; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: addq %rdx, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: imulq %r12, %rbx
+; X64-NEXT: addq %r8, %rbx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: imulq %r13, %rax
-; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: adcq %rcx, %rax
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %rdi, %rsi
-; X64-NEXT: addq %rbx, %rsi
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: addq %rax, %rsi
-; X64-NEXT: adcq %rdx, %rbx
-; X64-NEXT: setb %cl
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %r12, %r10
+; X64-NEXT: addq %rdx, %r10
+; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: adcq %rbx, %r10
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rdi
; X64-NEXT: addq %rax, %rbx
+; X64-NEXT: adcq %rdx, %r11
+; X64-NEXT: setb %cl
+; X64-NEXT: addq %rax, %r11
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: adcq %rdx, %rax
-; X64-NEXT: addq %r8, %rbx
-; X64-NEXT: adcq %r14, %rax
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload
-; X64-NEXT: adcq %r15, %rsi
-; X64-NEXT: adcq %rbp, %rbx
+; X64-NEXT: addq %r8, %r11
+; X64-NEXT: adcq %r10, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT: adcq %r15, %rbx
+; X64-NEXT: adcq %r13, %r11
; X64-NEXT: adcq %r9, %rax
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT: adcq %r11, %rbx
-; X64-NEXT: adcq %r12, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload
+; X64-NEXT: adcq %r14, %r11
+; X64-NEXT: adcq %rbp, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: sarq $63, %rcx
; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: xorq %rcx, %rsi
-; X64-NEXT: orq %rax, %rsi
; X64-NEXT: xorq %rcx, %rbx
-; X64-NEXT: xorq %rdi, %rcx
+; X64-NEXT: orq %rax, %rbx
+; X64-NEXT: xorq %rcx, %r11
+; X64-NEXT: xorq %rsi, %rcx
+; X64-NEXT: orq %r11, %rcx
; X64-NEXT: orq %rbx, %rcx
-; X64-NEXT: orq %rsi, %rcx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq %rdx, 24(%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
index af6b9f009bea8..3f2226aec2d3f 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
@@ -148,12 +148,12 @@ define i32 @test_calls_and_rets_noredzone(ptr%ptr) nounwind noredzone {
; X64-NOPIC-NEXT: sarq $63, %rax
; X64-NOPIC-NEXT: shlq $47, %rax
; X64-NOPIC-NEXT: orq %rax, %rsp
-; X64-NOPIC-NEXT: movq $.Lslh_ret_addr2, %rbp
+; X64-NOPIC-NEXT: movq $.Lslh_ret_addr2, %r15
; X64-NOPIC-NEXT: callq f@PLT
; X64-NOPIC-NEXT: .Lslh_ret_addr2:
; X64-NOPIC-NEXT: movq %rsp, %rax
; X64-NOPIC-NEXT: sarq $63, %rax
-; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr2, %rbp
+; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr2, %r15
; X64-NOPIC-NEXT: cmovneq %r14, %rax
; X64-NOPIC-NEXT: movl (%rbx), %ebp
; X64-NOPIC-NEXT: shlq $47, %rax
@@ -190,13 +190,13 @@ define i32 @test_calls_and_rets_noredzone(ptr%ptr) nounwind noredzone {
; X64-NOPIC-MCM-NEXT: sarq $63, %rax
; X64-NOPIC-MCM-NEXT: shlq $47, %rax
; X64-NOPIC-MCM-NEXT: orq %rax, %rsp
-; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr2(%rip), %rbp
+; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr2(%rip), %r15
; X64-NOPIC-MCM-NEXT: callq f@PLT
; X64-NOPIC-MCM-NEXT: .Lslh_ret_addr2:
; X64-NOPIC-MCM-NEXT: movq %rsp, %rax
; X64-NOPIC-MCM-NEXT: sarq $63, %rax
; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr2(%rip), %rcx
-; X64-NOPIC-MCM-NEXT: cmpq %rcx, %rbp
+; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r15
; X64-NOPIC-MCM-NEXT: cmovneq %r14, %rax
; X64-NOPIC-MCM-NEXT: movl (%rbx), %ebp
; X64-NOPIC-MCM-NEXT: shlq $47, %rax
@@ -234,13 +234,13 @@ define i32 @test_calls_and_rets_noredzone(ptr%ptr) nounwind noredzone {
; X64-PIC-NEXT: sarq $63, %rax
; X64-PIC-NEXT: shlq $47, %rax
; X64-PIC-NEXT: orq %rax, %rsp
-; X64-PIC-NEXT: leaq .Lslh_ret_addr2(%rip), %rbp
+; X64-PIC-NEXT: leaq .Lslh_ret_addr2(%rip), %r15
; X64-PIC-NEXT: callq f@PLT
; X64-PIC-NEXT: .Lslh_ret_addr2:
; X64-PIC-NEXT: movq %rsp, %rax
; X64-PIC-NEXT: sarq $63, %rax
; X64-PIC-NEXT: leaq .Lslh_ret_addr2(%rip), %rcx
-; X64-PIC-NEXT: cmpq %rcx, %rbp
+; X64-PIC-NEXT: cmpq %rcx, %r15
; X64-PIC-NEXT: cmovneq %r14, %rax
; X64-PIC-NEXT: movl (%rbx), %ebp
; X64-PIC-NEXT: shlq $47, %rax
@@ -295,18 +295,18 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-NOPIC-NEXT: shlq $47, %rax
; X64-NOPIC-NEXT: movq %r14, %rdi
; X64-NOPIC-NEXT: orq %rax, %rsp
-; X64-NOPIC-NEXT: movq $.Lslh_ret_addr4, %rbp
+; X64-NOPIC-NEXT: movq $.Lslh_ret_addr4, %r12
; X64-NOPIC-NEXT: callq setjmp@PLT
; X64-NOPIC-NEXT: .Lslh_ret_addr4:
; X64-NOPIC-NEXT: movq %rsp, %rax
; X64-NOPIC-NEXT: sarq $63, %rax
-; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr4, %rbp
+; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr4, %r12
; X64-NOPIC-NEXT: cmovneq %r15, %rax
-; X64-NOPIC-NEXT: movl (%rbx), %ebp
-; X64-NOPIC-NEXT: movl $42, %r12d
+; X64-NOPIC-NEXT: movl (%rbx), %r12d
+; X64-NOPIC-NEXT: movl $42, %ebp
; X64-NOPIC-NEXT: shlq $47, %rax
; X64-NOPIC-NEXT: movq %r14, %rdi
-; X64-NOPIC-NEXT: movl %r12d, %esi
+; X64-NOPIC-NEXT: movl %ebp, %esi
; X64-NOPIC-NEXT: orq %rax, %rsp
; X64-NOPIC-NEXT: movq $.Lslh_ret_addr5, %r13
; X64-NOPIC-NEXT: callq sigsetjmp@PLT
@@ -315,11 +315,11 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-NOPIC-NEXT: sarq $63, %rax
; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr5, %r13
; X64-NOPIC-NEXT: cmovneq %r15, %rax
-; X64-NOPIC-NEXT: addl (%rbx), %ebp
+; X64-NOPIC-NEXT: addl (%rbx), %r12d
; X64-NOPIC-NEXT: shlq $47, %rax
; X64-NOPIC-NEXT: movq %r14, %rdi
; X64-NOPIC-NEXT: movq %r14, %rsi
-; X64-NOPIC-NEXT: movl %r12d, %edx
+; X64-NOPIC-NEXT: movl %ebp, %edx
; X64-NOPIC-NEXT: orq %rax, %rsp
; X64-NOPIC-NEXT: movq $.Lslh_ret_addr6, %r14
; X64-NOPIC-NEXT: callq __sigsetjmp@PLT
@@ -329,8 +329,8 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr6, %r14
; X64-NOPIC-NEXT: movq %rax, %rcx
; X64-NOPIC-NEXT: cmovneq %r15, %rcx
-; X64-NOPIC-NEXT: addl (%rbx), %ebp
-; X64-NOPIC-NEXT: movl %ebp, %eax
+; X64-NOPIC-NEXT: addl (%rbx), %r12d
+; X64-NOPIC-NEXT: movl %r12d, %eax
; X64-NOPIC-NEXT: orl %ecx, %eax
; X64-NOPIC-NEXT: shlq $47, %rcx
; X64-NOPIC-NEXT: orq %rcx, %rsp
@@ -360,19 +360,19 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-NOPIC-MCM-NEXT: shlq $47, %rax
; X64-NOPIC-MCM-NEXT: movq %r14, %rdi
; X64-NOPIC-MCM-NEXT: orq %rax, %rsp
-; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %rbp
+; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %r12
; X64-NOPIC-MCM-NEXT: callq setjmp@PLT
; X64-NOPIC-MCM-NEXT: .Lslh_ret_addr4:
; X64-NOPIC-MCM-NEXT: movq %rsp, %rax
; X64-NOPIC-MCM-NEXT: sarq $63, %rax
; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %rcx
-; X64-NOPIC-MCM-NEXT: cmpq %rcx, %rbp
+; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r12
; X64-NOPIC-MCM-NEXT: cmovneq %r15, %rax
-; X64-NOPIC-MCM-NEXT: movl (%rbx), %ebp
-; X64-NOPIC-MCM-NEXT: movl $42, %r12d
+; X64-NOPIC-MCM-NEXT: movl (%rbx), %r12d
+; X64-NOPIC-MCM-NEXT: movl $42, %ebp
; X64-NOPIC-MCM-NEXT: shlq $47, %rax
; X64-NOPIC-MCM-NEXT: movq %r14, %rdi
-; X64-NOPIC-MCM-NEXT: movl %r12d, %esi
+; X64-NOPIC-MCM-NEXT: movl %ebp, %esi
; X64-NOPIC-MCM-NEXT: orq %rax, %rsp
; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr5(%rip), %r13
; X64-NOPIC-MCM-NEXT: callq sigsetjmp@PLT
@@ -382,11 +382,11 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr5(%rip), %rcx
; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r13
; X64-NOPIC-MCM-NEXT: cmovneq %r15, %rax
-; X64-NOPIC-MCM-NEXT: addl (%rbx), %ebp
+; X64-NOPIC-MCM-NEXT: addl (%rbx), %r12d
; X64-NOPIC-MCM-NEXT: shlq $47, %rax
; X64-NOPIC-MCM-NEXT: movq %r14, %rdi
; X64-NOPIC-MCM-NEXT: movq %r14, %rsi
-; X64-NOPIC-MCM-NEXT: movl %r12d, %edx
+; X64-NOPIC-MCM-NEXT: movl %ebp, %edx
; X64-NOPIC-MCM-NEXT: orq %rax, %rsp
; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr6(%rip), %r14
; X64-NOPIC-MCM-NEXT: callq __sigsetjmp@PLT
@@ -397,8 +397,8 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r14
; X64-NOPIC-MCM-NEXT: movq %rax, %rcx
; X64-NOPIC-MCM-NEXT: cmovneq %r15, %rcx
-; X64-NOPIC-MCM-NEXT: addl (%rbx), %ebp
-; X64-NOPIC-MCM-NEXT: movl %ebp, %eax
+; X64-NOPIC-MCM-NEXT: addl (%rbx), %r12d
+; X64-NOPIC-MCM-NEXT: movl %r12d, %eax
; X64-NOPIC-MCM-NEXT: orl %ecx, %eax
; X64-NOPIC-MCM-NEXT: shlq $47, %rcx
; X64-NOPIC-MCM-NEXT: orq %rcx, %rsp
@@ -428,19 +428,19 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-PIC-NEXT: shlq $47, %rax
; X64-PIC-NEXT: movq %r14, %rdi
; X64-PIC-NEXT: orq %rax, %rsp
-; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %rbp
+; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %r12
; X64-PIC-NEXT: callq setjmp@PLT
; X64-PIC-NEXT: .Lslh_ret_addr4:
; X64-PIC-NEXT: movq %rsp, %rax
; X64-PIC-NEXT: sarq $63, %rax
; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %rcx
-; X64-PIC-NEXT: cmpq %rcx, %rbp
+; X64-PIC-NEXT: cmpq %rcx, %r12
; X64-PIC-NEXT: cmovneq %r15, %rax
-; X64-PIC-NEXT: movl (%rbx), %ebp
-; X64-PIC-NEXT: movl $42, %r12d
+; X64-PIC-NEXT: movl (%rbx), %r12d
+; X64-PIC-NEXT: movl $42, %ebp
; X64-PIC-NEXT: shlq $47, %rax
; X64-PIC-NEXT: movq %r14, %rdi
-; X64-PIC-NEXT: movl %r12d, %esi
+; X64-PIC-NEXT: movl %ebp, %esi
; X64-PIC-NEXT: orq %rax, %rsp
; X64-PIC-NEXT: leaq .Lslh_ret_addr5(%rip), %r13
; X64-PIC-NEXT: callq sigsetjmp@PLT
@@ -450,11 +450,11 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-PIC-NEXT: leaq .Lslh_ret_addr5(%rip), %rcx
; X64-PIC-NEXT: cmpq %rcx, %r13
; X64-PIC-NEXT: cmovneq %r15, %rax
-; X64-PIC-NEXT: addl (%rbx), %ebp
+; X64-PIC-NEXT: addl (%rbx), %r12d
; X64-PIC-NEXT: shlq $47, %rax
; X64-PIC-NEXT: movq %r14, %rdi
; X64-PIC-NEXT: movq %r14, %rsi
-; X64-PIC-NEXT: movl %r12d, %edx
+; X64-PIC-NEXT: movl %ebp, %edx
; X64-PIC-NEXT: orq %rax, %rsp
; X64-PIC-NEXT: leaq .Lslh_ret_addr6(%rip), %r14
; X64-PIC-NEXT: callq __sigsetjmp@PLT
@@ -465,8 +465,8 @@ define i32 @test_call_setjmp(ptr%ptr) nounwind {
; X64-PIC-NEXT: cmpq %rcx, %r14
; X64-PIC-NEXT: movq %rax, %rcx
; X64-PIC-NEXT: cmovneq %r15, %rcx
-; X64-PIC-NEXT: addl (%rbx), %ebp
-; X64-PIC-NEXT: movl %ebp, %eax
+; X64-PIC-NEXT: addl (%rbx), %r12d
+; X64-PIC-NEXT: movl %r12d, %eax
; X64-PIC-NEXT: orl %ecx, %eax
; X64-PIC-NEXT: shlq $47, %rcx
; X64-PIC-NEXT: orq %rcx, %rsp
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening.ll b/llvm/test/CodeGen/X86/speculative-load-hardening.ll
index 315b6f2e6b248..2352bc768ae82 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening.ll
@@ -42,16 +42,16 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2,
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: movq $-1, %rbx
+; X64-NEXT: movq $-1, %r14
; X64-NEXT: sarq $63, %rax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jne .LBB1_1
; X64-NEXT: # %bb.2: # %then1
-; X64-NEXT: cmovneq %rbx, %rax
+; X64-NEXT: cmovneq %r14, %rax
; X64-NEXT: testl %esi, %esi
; X64-NEXT: je .LBB1_4
; X64-NEXT: .LBB1_1:
-; X64-NEXT: cmoveq %rbx, %rax
+; X64-NEXT: cmoveq %r14, %rax
; X64-NEXT: .LBB1_8: # %exit
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
@@ -64,24 +64,24 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2,
; X64-NEXT: retq
; X64-NEXT: .LBB1_4: # %then2
; X64-NEXT: .cfi_def_cfa_offset 32
-; X64-NEXT: movq %r8, %r14
-; X64-NEXT: cmovneq %rbx, %rax
+; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: cmovneq %r14, %rax
; X64-NEXT: testl %edx, %edx
; X64-NEXT: je .LBB1_6
; X64-NEXT: # %bb.5: # %else3
-; X64-NEXT: cmoveq %rbx, %rax
+; X64-NEXT: cmoveq %r14, %rax
; X64-NEXT: movslq (%r9), %rcx
; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: leaq (%r14,%rcx,4), %r15
-; X64-NEXT: movl %ecx, (%r14,%rcx,4)
+; X64-NEXT: leaq (%rbx,%rcx,4), %r15
+; X64-NEXT: movl %ecx, (%rbx,%rcx,4)
; X64-NEXT: jmp .LBB1_7
; X64-NEXT: .LBB1_6: # %then3
-; X64-NEXT: cmovneq %rbx, %rax
+; X64-NEXT: cmovneq %r14, %rax
; X64-NEXT: movl (%rcx), %ecx
-; X64-NEXT: addl (%r14), %ecx
+; X64-NEXT: addl (%rbx), %ecx
; X64-NEXT: movslq %ecx, %rdi
; X64-NEXT: orq %rax, %rdi
-; X64-NEXT: movl (%r14,%rdi,4), %esi
+; X64-NEXT: movl (%rbx,%rdi,4), %esi
; X64-NEXT: orl %eax, %esi
; X64-NEXT: movq (%r9), %r15
; X64-NEXT: orq %rax, %r15
@@ -95,11 +95,11 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2,
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr0, %rcx
-; X64-NEXT: cmovneq %rbx, %rax
+; X64-NEXT: cmovneq %r14, %rax
; X64-NEXT: .LBB1_7: # %merge
; X64-NEXT: movslq (%r15), %rcx
; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: movl $0, (%r14,%rcx,4)
+; X64-NEXT: movl $0, (%rbx,%rcx,4)
; X64-NEXT: jmp .LBB1_8
;
; X64-LFENCE-LABEL: test_basic_conditions:
@@ -210,18 +210,18 @@ define void @test_basic_loop(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2) nounwind spec
; X64-NEXT: cmoveq %r15, %rax
; X64-NEXT: jmp .LBB2_5
; X64-NEXT: .LBB2_2: # %l.header.preheader
-; X64-NEXT: movq %rcx, %r14
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movl %esi, %ebp
; X64-NEXT: cmovneq %r15, %rax
-; X64-NEXT: xorl %ebx, %ebx
+; X64-NEXT: xorl %r12d, %r12d
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB2_3: # %l.header
; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movslq (%r12), %rcx
+; X64-NEXT: movslq (%r14), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: movq %rax, %rdx
-; X64-NEXT: orq %r14, %rdx
+; X64-NEXT: orq %rbx, %rdx
; X64-NEXT: movl (%rdx,%rcx,4), %edi
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
@@ -232,8 +232,8 @@ define void @test_basic_loop(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2) nounwind spec
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr1, %rcx
; X64-NEXT: cmovneq %r15, %rax
-; X64-NEXT: incl %ebx
-; X64-NEXT: cmpl %ebp, %ebx
+; X64-NEXT: incl %r12d
+; X64-NEXT: cmpl %ebp, %r12d
; X64-NEXT: jge .LBB2_4
; X64-NEXT: # %bb.6: # in Loop: Header=BB2_3 Depth=1
; X64-NEXT: cmovgeq %r15, %rax
@@ -260,20 +260,20 @@ define void @test_basic_loop(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2) nounwind spec
; X64-LFENCE-NEXT: testl %edi, %edi
; X64-LFENCE-NEXT: jne .LBB2_3
; X64-LFENCE-NEXT: # %bb.1: # %l.header.preheader
-; X64-LFENCE-NEXT: movq %rcx, %r14
-; X64-LFENCE-NEXT: movq %rdx, %r15
+; X64-LFENCE-NEXT: movq %rcx, %rbx
+; X64-LFENCE-NEXT: movq %rdx, %r14
; X64-LFENCE-NEXT: movl %esi, %ebp
; X64-LFENCE-NEXT: lfence
-; X64-LFENCE-NEXT: xorl %ebx, %ebx
+; X64-LFENCE-NEXT: xorl %r15d, %r15d
; X64-LFENCE-NEXT: .p2align 4, 0x90
; X64-LFENCE-NEXT: .LBB2_2: # %l.header
; X64-LFENCE-NEXT: # =>This Inner Loop Header: Depth=1
; X64-LFENCE-NEXT: lfence
-; X64-LFENCE-NEXT: movslq (%r15), %rax
-; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi
+; X64-LFENCE-NEXT: movslq (%r14), %rax
+; X64-LFENCE-NEXT: movl (%rbx,%rax,4), %edi
; X64-LFENCE-NEXT: callq sink@PLT
-; X64-LFENCE-NEXT: incl %ebx
-; X64-LFENCE-NEXT: cmpl %ebp, %ebx
+; X64-LFENCE-NEXT: incl %r15d
+; X64-LFENCE-NEXT: cmpl %ebp, %r15d
; X64-LFENCE-NEXT: jl .LBB2_2
; X64-LFENCE-NEXT: .LBB2_3: # %exit
; X64-LFENCE-NEXT: lfence
@@ -312,34 +312,34 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2
; X64-NEXT: pushq %rbx
; X64-NEXT: pushq %rax
; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: movq $-1, %rbp
+; X64-NEXT: movq $-1, %r12
; X64-NEXT: sarq $63, %rax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB3_2
; X64-NEXT: # %bb.1:
-; X64-NEXT: cmoveq %rbp, %rax
+; X64-NEXT: cmoveq %r12, %rax
; X64-NEXT: jmp .LBB3_10
; X64-NEXT: .LBB3_2: # %l1.header.preheader
-; X64-NEXT: movq %r8, %r14
-; X64-NEXT: movq %rcx, %rbx
-; X64-NEXT: movl %edx, %r12d
+; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movl %edx, %ebp
; X64-NEXT: movl %esi, %r15d
-; X64-NEXT: cmovneq %rbp, %rax
+; X64-NEXT: cmovneq %r12, %rax
; X64-NEXT: xorl %r13d, %r13d
; X64-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: testl %r15d, %r15d
; X64-NEXT: jle .LBB3_4
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_5: # %l2.header.preheader
-; X64-NEXT: cmovleq %rbp, %rax
+; X64-NEXT: cmovleq %r12, %rax
; X64-NEXT: xorl %r15d, %r15d
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_6: # %l2.header
; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movslq (%rbx), %rcx
+; X64-NEXT: movslq (%r14), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: movq %rax, %rdx
-; X64-NEXT: orq %r14, %rdx
+; X64-NEXT: orq %rbx, %rdx
; X64-NEXT: movl (%rdx,%rcx,4), %edi
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
@@ -349,26 +349,26 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr2, %rcx
-; X64-NEXT: cmovneq %rbp, %rax
+; X64-NEXT: cmovneq %r12, %rax
; X64-NEXT: incl %r15d
-; X64-NEXT: cmpl %r12d, %r15d
+; X64-NEXT: cmpl %ebp, %r15d
; X64-NEXT: jge .LBB3_7
; X64-NEXT: # %bb.11: # in Loop: Header=BB3_6 Depth=1
-; X64-NEXT: cmovgeq %rbp, %rax
+; X64-NEXT: cmovgeq %r12, %rax
; X64-NEXT: jmp .LBB3_6
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_7:
-; X64-NEXT: cmovlq %rbp, %rax
+; X64-NEXT: cmovlq %r12, %rax
; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Reload
; X64-NEXT: jmp .LBB3_8
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_4:
-; X64-NEXT: cmovgq %rbp, %rax
+; X64-NEXT: cmovgq %r12, %rax
; X64-NEXT: .LBB3_8: # %l1.latch
-; X64-NEXT: movslq (%rbx), %rcx
+; X64-NEXT: movslq (%r14), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: movq %rax, %rdx
-; X64-NEXT: orq %r14, %rdx
+; X64-NEXT: orq %rbx, %rdx
; X64-NEXT: movl (%rdx,%rcx,4), %edi
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
@@ -378,17 +378,17 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr3, %rcx
-; X64-NEXT: cmovneq %rbp, %rax
+; X64-NEXT: cmovneq %r12, %rax
; X64-NEXT: incl %r13d
; X64-NEXT: cmpl %r15d, %r13d
; X64-NEXT: jge .LBB3_9
; X64-NEXT: # %bb.12:
-; X64-NEXT: cmovgeq %rbp, %rax
+; X64-NEXT: cmovgeq %r12, %rax
; X64-NEXT: testl %r15d, %r15d
; X64-NEXT: jg .LBB3_5
; X64-NEXT: jmp .LBB3_4
; X64-NEXT: .LBB3_9:
-; X64-NEXT: cmovlq %rbp, %rax
+; X64-NEXT: cmovlq %r12, %rax
; X64-NEXT: .LBB3_10: # %exit
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
@@ -423,9 +423,9 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2
; X64-LFENCE-NEXT: popq %rbp
; X64-LFENCE-NEXT: retq
; X64-LFENCE-NEXT: .LBB3_1: # %l1.header.preheader
-; X64-LFENCE-NEXT: movq %r8, %r14
-; X64-LFENCE-NEXT: movq %rcx, %rbx
-; X64-LFENCE-NEXT: movl %edx, %r13d
+; X64-LFENCE-NEXT: movq %r8, %rbx
+; X64-LFENCE-NEXT: movq %rcx, %r14
+; X64-LFENCE-NEXT: movl %edx, %ebp
; X64-LFENCE-NEXT: movl %esi, %r15d
; X64-LFENCE-NEXT: lfence
; X64-LFENCE-NEXT: xorl %r12d, %r12d
@@ -434,8 +434,8 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2
; X64-LFENCE-NEXT: .LBB3_5: # %l1.latch
; X64-LFENCE-NEXT: # in Loop: Header=BB3_2 Depth=1
; X64-LFENCE-NEXT: lfence
-; X64-LFENCE-NEXT: movslq (%rbx), %rax
-; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi
+; X64-LFENCE-NEXT: movslq (%r14), %rax
+; X64-LFENCE-NEXT: movl (%rbx,%rax,4), %edi
; X64-LFENCE-NEXT: callq sink@PLT
; X64-LFENCE-NEXT: incl %r12d
; X64-LFENCE-NEXT: cmpl %r15d, %r12d
@@ -449,17 +449,17 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2
; X64-LFENCE-NEXT: # %bb.3: # %l2.header.preheader
; X64-LFENCE-NEXT: # in Loop: Header=BB3_2 Depth=1
; X64-LFENCE-NEXT: lfence
-; X64-LFENCE-NEXT: xorl %ebp, %ebp
+; X64-LFENCE-NEXT: xorl %r13d, %r13d
; X64-LFENCE-NEXT: .p2align 4, 0x90
; X64-LFENCE-NEXT: .LBB3_4: # %l2.header
; X64-LFENCE-NEXT: # Parent Loop BB3_2 Depth=1
; X64-LFENCE-NEXT: # => This Inner Loop Header: Depth=2
; X64-LFENCE-NEXT: lfence
-; X64-LFENCE-NEXT: movslq (%rbx), %rax
-; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi
+; X64-LFENCE-NEXT: movslq (%r14), %rax
+; X64-LFENCE-NEXT: movl (%rbx,%rax,4), %edi
; X64-LFENCE-NEXT: callq sink@PLT
-; X64-LFENCE-NEXT: incl %ebp
-; X64-LFENCE-NEXT: cmpl %r13d, %ebp
+; X64-LFENCE-NEXT: incl %r13d
+; X64-LFENCE-NEXT: cmpl %ebp, %r13d
; X64-LFENCE-NEXT: jl .LBB3_4
; X64-LFENCE-NEXT: jmp .LBB3_5
entry:
@@ -542,13 +542,13 @@ define void @test_fp_loads(ptr %fptr, ptr %dptr, ptr %i32ptr, ptr %i64ptr) nounw
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: movq %rcx, %r15
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rsi, %r12
+; X64-NEXT: movq %rdi, %r15
; X64-NEXT: movq $-1, %r13
; X64-NEXT: sarq $63, %rax
-; X64-NEXT: orq %rax, %r12
+; X64-NEXT: orq %rax, %r15
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
@@ -559,7 +559,7 @@ define void @test_fp_loads(ptr %fptr, ptr %dptr, ptr %i32ptr, ptr %i64ptr) nounw
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr7, %rcx
; X64-NEXT: cmovneq %r13, %rax
-; X64-NEXT: orq %rax, %rbx
+; X64-NEXT: orq %rax, %r12
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
@@ -592,9 +592,9 @@ define void @test_fp_loads(ptr %fptr, ptr %dptr, ptr %i32ptr, ptr %i64ptr) nounw
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr10, %rcx
; X64-NEXT: cmovneq %r13, %rax
-; X64-NEXT: orq %rax, %r14
+; X64-NEXT: orq %rax, %rbx
; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: cvtsi2ssl (%r14), %xmm0
+; X64-NEXT: cvtsi2ssl (%rbx), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_float@PLT
@@ -604,9 +604,9 @@ define void @test_fp_loads(ptr %fptr, ptr %dptr, ptr %i32ptr, ptr %i64ptr) nounw
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr11, %rcx
; X64-NEXT: cmovneq %r13, %rax
-; X64-NEXT: orq %rax, %r15
+; X64-NEXT: orq %rax, %r14
; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: cvtsi2sdq (%r15), %xmm0
+; X64-NEXT: cvtsi2sdq (%r14), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_double@PLT
@@ -617,7 +617,7 @@ define void @test_fp_loads(ptr %fptr, ptr %dptr, ptr %i32ptr, ptr %i64ptr) nounw
; X64-NEXT: cmpq $.Lslh_ret_addr12, %rcx
; X64-NEXT: cmovneq %r13, %rax
; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: cvtsi2ssq (%r15), %xmm0
+; X64-NEXT: cvtsi2ssq (%r14), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_float@PLT
@@ -628,7 +628,7 @@ define void @test_fp_loads(ptr %fptr, ptr %dptr, ptr %i32ptr, ptr %i64ptr) nounw
; X64-NEXT: cmpq $.Lslh_ret_addr13, %rcx
; X64-NEXT: cmovneq %r13, %rax
; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: cvtsi2sdl (%r14), %xmm0
+; X64-NEXT: cvtsi2sdl (%rbx), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_double@PLT
@@ -654,9 +654,9 @@ define void @test_fp_loads(ptr %fptr, ptr %dptr, ptr %i32ptr, ptr %i64ptr) nounw
; X64-LFENCE-NEXT: pushq %r12
; X64-LFENCE-NEXT: pushq %rbx
; X64-LFENCE-NEXT: pushq %rax
-; X64-LFENCE-NEXT: movq %rcx, %r15
-; X64-LFENCE-NEXT: movq %rdx, %r14
-; X64-LFENCE-NEXT: movq %rsi, %rbx
+; X64-LFENCE-NEXT: movq %rcx, %r14
+; X64-LFENCE-NEXT: movq %rdx, %rbx
+; X64-LFENCE-NEXT: movq %rsi, %r15
; X64-LFENCE-NEXT: movq %rdi, %r12
; X64-LFENCE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-LFENCE-NEXT: callq sink_float@PLT
@@ -669,16 +669,16 @@ define void @test_fp_loads(ptr %fptr, ptr %dptr, ptr %i32ptr, ptr %i64ptr) nounw
; X64-LFENCE-NEXT: cvtss2sd %xmm0, %xmm0
; X64-LFENCE-NEXT: callq sink_double@PLT
; X64-LFENCE-NEXT: xorps %xmm0, %xmm0
-; X64-LFENCE-NEXT: cvtsi2ssl (%r14), %xmm0
+; X64-LFENCE-NEXT: cvtsi2ssl (%rbx), %xmm0
; X64-LFENCE-NEXT: callq sink_float@PLT
; X64-LFENCE-NEXT: xorps %xmm0, %xmm0
-; X64-LFENCE-NEXT: cvtsi2sdq (%r15), %xmm0
+; X64-LFENCE-NEXT: cvtsi2sdq (%r14), %xmm0
; X64-LFENCE-NEXT: callq sink_double@PLT
; X64-LFENCE-NEXT: xorps %xmm0, %xmm0
-; X64-LFENCE-NEXT: cvtsi2ssq (%r15), %xmm0
+; X64-LFENCE-NEXT: cvtsi2ssq (%r14), %xmm0
; X64-LFENCE-NEXT: callq sink_float@PLT
; X64-LFENCE-NEXT: xorps %xmm0, %xmm0
-; X64-LFENCE-NEXT: cvtsi2sdl (%r14), %xmm0
+; X64-LFENCE-NEXT: cvtsi2sdl (%rbx), %xmm0
; X64-LFENCE-NEXT: callq sink_double@PLT
; X64-LFENCE-NEXT: addq $8, %rsp
; X64-LFENCE-NEXT: popq %rbx
@@ -731,11 +731,11 @@ define void @test_vec_loads(ptr %v4f32ptr, ptr %v2f64ptr, ptr %v16i8ptr, ptr %v8
; X64-NEXT: pushq %rbx
; X64-NEXT: pushq %rax
; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: movq %r9, %r14
-; X64-NEXT: movq %r8, %r15
-; X64-NEXT: movq %rcx, %r12
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %r9, %rbx
+; X64-NEXT: movq %r8, %r14
+; X64-NEXT: movq %rcx, %r15
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rsi, %r13
; X64-NEXT: movq $-1, %rbp
; X64-NEXT: sarq $63, %rax
; X64-NEXT: orq %rax, %rdi
@@ -749,8 +749,8 @@ define void @test_vec_loads(ptr %v4f32ptr, ptr %v2f64ptr, ptr %v16i8ptr, ptr %v8
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr15, %rcx
; X64-NEXT: cmovneq %rbp, %rax
-; X64-NEXT: orq %rax, %rbx
-; X64-NEXT: movaps (%rbx), %xmm0
+; X64-NEXT: orq %rax, %r13
+; X64-NEXT: movaps (%r13), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v2f64@PLT
@@ -760,8 +760,8 @@ define void @test_vec_loads(ptr %v4f32ptr, ptr %v2f64ptr, ptr %v16i8ptr, ptr %v8
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr16, %rcx
; X64-NEXT: cmovneq %rbp, %rax
-; X64-NEXT: orq %rax, %r13
-; X64-NEXT: movaps (%r13), %xmm0
+; X64-NEXT: orq %rax, %r12
+; X64-NEXT: movaps (%r12), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v16i8@PLT
@@ -771,8 +771,8 @@ define void @test_vec_loads(ptr %v4f32ptr, ptr %v2f64ptr, ptr %v16i8ptr, ptr %v8
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr17, %rcx
; X64-NEXT: cmovneq %rbp, %rax
-; X64-NEXT: orq %rax, %r12
-; X64-NEXT: movaps (%r12), %xmm0
+; X64-NEXT: orq %rax, %r15
+; X64-NEXT: movaps (%r15), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v8i16@PLT
@@ -782,8 +782,8 @@ define void @test_vec_loads(ptr %v4f32ptr, ptr %v2f64ptr, ptr %v16i8ptr, ptr %v8
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr18, %rcx
; X64-NEXT: cmovneq %rbp, %rax
-; X64-NEXT: orq %rax, %r15
-; X64-NEXT: movaps (%r15), %xmm0
+; X64-NEXT: orq %rax, %r14
+; X64-NEXT: movaps (%r14), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v4i32@PLT
@@ -793,8 +793,8 @@ define void @test_vec_loads(ptr %v4f32ptr, ptr %v2f64ptr, ptr %v16i8ptr, ptr %v8
; X64-NEXT: sarq $63, %rax
; X64-NEXT: cmpq $.Lslh_ret_addr19, %rcx
; X64-NEXT: cmovneq %rbp, %rax
-; X64-NEXT: orq %rax, %r14
-; X64-NEXT: movaps (%r14), %xmm0
+; X64-NEXT: orq %rax, %rbx
+; X64-NEXT: movaps (%rbx), %xmm0
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v2i64@PLT
@@ -822,22 +822,22 @@ define void @test_vec_loads(ptr %v4f32ptr, ptr %v2f64ptr, ptr %v16i8ptr, ptr %v8
; X64-LFENCE-NEXT: pushq %r13
; X64-LFENCE-NEXT: pushq %r12
; X64-LFENCE-NEXT: pushq %rbx
-; X64-LFENCE-NEXT: movq %r9, %r14
-; X64-LFENCE-NEXT: movq %r8, %r15
-; X64-LFENCE-NEXT: movq %rcx, %r12
-; X64-LFENCE-NEXT: movq %rdx, %r13
-; X64-LFENCE-NEXT: movq %rsi, %rbx
+; X64-LFENCE-NEXT: movq %r9, %rbx
+; X64-LFENCE-NEXT: movq %r8, %r14
+; X64-LFENCE-NEXT: movq %rcx, %r15
+; X64-LFENCE-NEXT: movq %rdx, %r12
+; X64-LFENCE-NEXT: movq %rsi, %r13
; X64-LFENCE-NEXT: movaps (%rdi), %xmm0
; X64-LFENCE-NEXT: callq sink_v4f32@PLT
-; X64-LFENCE-NEXT: movaps (%rbx), %xmm0
-; X64-LFENCE-NEXT: callq sink_v2f64@PLT
; X64-LFENCE-NEXT: movaps (%r13), %xmm0
-; X64-LFENCE-NEXT: callq sink_v16i8@PLT
+; X64-LFENCE-NEXT: callq sink_v2f64@PLT
; X64-LFENCE-NEXT: movaps (%r12), %xmm0
-; X64-LFENCE-NEXT: callq sink_v8i16@PLT
+; X64-LFENCE-NEXT: callq sink_v16i8@PLT
; X64-LFENCE-NEXT: movaps (%r15), %xmm0
-; X64-LFENCE-NEXT: callq sink_v4i32@PLT
+; X64-LFENCE-NEXT: callq sink_v8i16@PLT
; X64-LFENCE-NEXT: movaps (%r14), %xmm0
+; X64-LFENCE-NEXT: callq sink_v4i32@PLT
+; X64-LFENCE-NEXT: movaps (%rbx), %xmm0
; X64-LFENCE-NEXT: callq sink_v2i64@PLT
; X64-LFENCE-NEXT: popq %rbx
; X64-LFENCE-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index d45662158bbc7..aeb39a1e242d9 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2360,8 +2360,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; CHECK-AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
; CHECK-AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; CHECK-AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm6
@@ -2383,22 +2383,22 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7
-; CHECK-AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm0[9],zero,zero,zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,zero,zero,xmm0[15],zero
-; CHECK-AVX1-NEXT: vpackuswb %xmm5, %xmm7, %xmm5
-; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5
-; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-AVX1-NEXT: vpsraw $8, %xmm6, %xmm6
-; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
-; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm0[9],zero,zero,zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,zero,zero,xmm0[15],zero
+; CHECK-AVX1-NEXT: vpackuswb %xmm8, %xmm7, %xmm7
+; CHECK-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6
+; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-AVX1-NEXT: vpsraw $8, %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
-; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6
-; CHECK-AVX1-NEXT: vpsrlw $7, %xmm5, %xmm5
-; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
-; CHECK-AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
-; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5
+; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-AVX1-NEXT: vpsraw $8, %xmm8, %xmm8
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8, %xmm8
+; CHECK-AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8
+; CHECK-AVX1-NEXT: vpackuswb %xmm7, %xmm8, %xmm7
+; CHECK-AVX1-NEXT: vpsrlw $7, %xmm6, %xmm6
+; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; CHECK-AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5
+; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6
diff --git a/llvm/test/CodeGen/X86/sse-intel-ocl.ll b/llvm/test/CodeGen/X86/sse-intel-ocl.ll
index 2c517d24c939a..b2de7545ff5f5 100644
--- a/llvm/test/CodeGen/X86/sse-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/sse-intel-ocl.ll
@@ -42,14 +42,14 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
; WIN64-NEXT: movaps (%r8), %xmm2
; WIN64-NEXT: movaps (%rdx), %xmm1
; WIN64-NEXT: movaps (%rcx), %xmm0
-; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax
; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; WIN64-NEXT: addps (%rax), %xmm0
+; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; WIN64-NEXT: addps (%r8), %xmm0
; WIN64-NEXT: addps (%rdx), %xmm1
; WIN64-NEXT: addps (%rcx), %xmm2
-; WIN64-NEXT: addps (%r8), %xmm3
+; WIN64-NEXT: addps (%rax), %xmm3
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT: callq func_float16_ptr
; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm0
diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll
index 09b16bccec40c..0226052402cb8 100644
--- a/llvm/test/CodeGen/X86/sse-regcall.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall.ll
@@ -243,11 +243,8 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
;
; WIN64-LABEL: testi32_inp:
; WIN64: # %bb.0:
-; WIN64-NEXT: pushq %r13
-; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
-; WIN64-NEXT: movl %ecx, %ebx
; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15
; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
@@ -257,45 +254,40 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
-; WIN64-NEXT: leal (%rdx,%rdi), %r13d
+; WIN64-NEXT: leal (%rdx,%rdi), %ebx
; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx
; WIN64-NEXT: subl %edi, %edx
-; WIN64-NEXT: leal (%rsi,%r8), %ecx
+; WIN64-NEXT: leal (%rsi,%r8), %edi
; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi
; WIN64-NEXT: subl %r8d, %esi
; WIN64-NEXT: leal (%r9,%r10), %r8d
-; WIN64-NEXT: movl %r9d, %ebp
-; WIN64-NEXT: subl %r10d, %ebp
-; WIN64-NEXT: movl %eax, %edi
-; WIN64-NEXT: movl %ebx, %r9d
-; WIN64-NEXT: subl %ebx, %edi
-; WIN64-NEXT: imull %edi, %ebp
-; WIN64-NEXT: leal (%r11,%r12), %edi
-; WIN64-NEXT: movl %r11d, %ebx
-; WIN64-NEXT: subl %r12d, %ebx
-; WIN64-NEXT: imull %edx, %ebx
-; WIN64-NEXT: addl %ebp, %ebx
+; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9
+; WIN64-NEXT: subl %r10d, %r9d
+; WIN64-NEXT: movl %eax, %r10d
+; WIN64-NEXT: subl %ecx, %r10d
+; WIN64-NEXT: imull %r10d, %r9d
+; WIN64-NEXT: leal (%r11,%r12), %r10d
+; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11
+; WIN64-NEXT: subl %r12d, %r11d
+; WIN64-NEXT: imull %edx, %r11d
+; WIN64-NEXT: addl %r9d, %r11d
; WIN64-NEXT: leal (%r14,%r15), %edx
-; WIN64-NEXT: movl %r14d, %ebp
-; WIN64-NEXT: subl %r15d, %ebp
-; WIN64-NEXT: imull %esi, %ebp
-; WIN64-NEXT: addl %ebx, %ebp
-; WIN64-NEXT: addl %r9d, %eax
+; WIN64-NEXT: movl %r14d, %r9d
+; WIN64-NEXT: subl %r15d, %r9d
+; WIN64-NEXT: imull %esi, %r9d
+; WIN64-NEXT: addl %r11d, %r9d
+; WIN64-NEXT: addl %ecx, %eax
; WIN64-NEXT: imull %r8d, %eax
-; WIN64-NEXT: imull %r13d, %edi
-; WIN64-NEXT: addl %edi, %eax
-; WIN64-NEXT: imull %ecx, %edx
+; WIN64-NEXT: imull %ebx, %r10d
+; WIN64-NEXT: addl %r10d, %eax
+; WIN64-NEXT: imull %edi, %edx
; WIN64-NEXT: addl %edx, %eax
-; WIN64-NEXT: addl %ebp, %eax
+; WIN64-NEXT: addl %r9d, %eax
; WIN64-NEXT: popq %rbx
-; WIN64-NEXT: popq %rbp
-; WIN64-NEXT: popq %r13
; WIN64-NEXT: retq
;
; LINUXOSX-LABEL: testi32_inp:
; LINUXOSX: # %bb.0:
-; LINUXOSX-NEXT: pushq %rbp
-; LINUXOSX-NEXT: pushq %rbx
; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14
@@ -305,37 +297,35 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8
; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi
; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d
-; LINUXOSX-NEXT: movl %edx, %ebp
-; LINUXOSX-NEXT: subl %edi, %ebp
-; LINUXOSX-NEXT: leal (%rsi,%r8), %r11d
+; LINUXOSX-NEXT: # kill: def $edx killed $edx killed $rdx
+; LINUXOSX-NEXT: subl %edi, %edx
+; LINUXOSX-NEXT: leal (%rsi,%r8), %edi
; LINUXOSX-NEXT: # kill: def $esi killed $esi killed $rsi
; LINUXOSX-NEXT: subl %r8d, %esi
; LINUXOSX-NEXT: leal (%r9,%r12), %r8d
-; LINUXOSX-NEXT: movl %r9d, %edi
-; LINUXOSX-NEXT: subl %r12d, %edi
-; LINUXOSX-NEXT: movl %eax, %edx
-; LINUXOSX-NEXT: subl %ecx, %edx
-; LINUXOSX-NEXT: imull %edx, %edi
-; LINUXOSX-NEXT: leal (%r13,%r14), %edx
-; LINUXOSX-NEXT: movl %r13d, %ebx
-; LINUXOSX-NEXT: subl %r14d, %ebx
-; LINUXOSX-NEXT: imull %ebp, %ebx
-; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %ebp
-; LINUXOSX-NEXT: addl %edi, %ebx
-; LINUXOSX-NEXT: movl %r15d, %edi
-; LINUXOSX-NEXT: subl %ebp, %edi
-; LINUXOSX-NEXT: imull %esi, %edi
-; LINUXOSX-NEXT: addl %ebx, %edi
+; LINUXOSX-NEXT: # kill: def $r9d killed $r9d killed $r9
+; LINUXOSX-NEXT: subl %r12d, %r9d
+; LINUXOSX-NEXT: movl %eax, %r11d
+; LINUXOSX-NEXT: subl %ecx, %r11d
+; LINUXOSX-NEXT: imull %r11d, %r9d
+; LINUXOSX-NEXT: leal (%r13,%r14), %r11d
+; LINUXOSX-NEXT: movl %r13d, %r12d
+; LINUXOSX-NEXT: subl %r14d, %r12d
+; LINUXOSX-NEXT: imull %edx, %r12d
+; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; LINUXOSX-NEXT: addl %r9d, %r12d
+; LINUXOSX-NEXT: movl %r15d, %r9d
+; LINUXOSX-NEXT: subl %edx, %r9d
+; LINUXOSX-NEXT: imull %esi, %r9d
+; LINUXOSX-NEXT: addl %r12d, %r9d
; LINUXOSX-NEXT: addl %ecx, %eax
; LINUXOSX-NEXT: imull %r8d, %eax
-; LINUXOSX-NEXT: imull %r10d, %edx
+; LINUXOSX-NEXT: imull %r10d, %r11d
+; LINUXOSX-NEXT: addl %r11d, %eax
+; LINUXOSX-NEXT: addl %r15d, %edx
+; LINUXOSX-NEXT: imull %edi, %edx
; LINUXOSX-NEXT: addl %edx, %eax
-; LINUXOSX-NEXT: addl %r15d, %ebp
-; LINUXOSX-NEXT: imull %r11d, %ebp
-; LINUXOSX-NEXT: addl %ebp, %eax
-; LINUXOSX-NEXT: addl %edi, %eax
-; LINUXOSX-NEXT: popq %rbx
-; LINUXOSX-NEXT: popq %rbp
+; LINUXOSX-NEXT: addl %r9d, %eax
; LINUXOSX-NEXT: retq
i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind {
%x1 = sub i32 %a1, %a2
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 348fe27616479..c0f0b1f3d43d9 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3533,10 +3533,10 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a
;
; X64-AVX1-LABEL: test_mm_set_epi8:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x48]
-; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50]
-; X64-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0]
-; X64-AVX1-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01]
+; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x48]
+; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x50]
+; X64-AVX1-NEXT: vmovd %r10d, %xmm0 # encoding: [0xc4,0xc1,0x79,0x6e,0xc2]
+; X64-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40]
; X64-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38]
@@ -3569,10 +3569,10 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a
;
; X64-AVX512-LABEL: test_mm_set_epi8:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x48]
-; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50]
-; X64-AVX512-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; X64-AVX512-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01]
+; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x48]
+; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x50]
+; X64-AVX512-NEXT: vmovd %r10d, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc2]
+; X64-AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40]
; X64-AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38]
@@ -3671,10 +3671,10 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a
;
; X32-AVX1-LABEL: test_mm_set_epi8:
; X32-AVX1: # %bb.0:
-; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x48]
-; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50]
-; X32-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0]
-; X32-AVX1-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01]
+; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48]
+; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x50]
+; X32-AVX1-NEXT: vmovd %r10d, %xmm0 # encoding: [0xc4,0xc1,0x79,0x6e,0xc2]
+; X32-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40]
; X32-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38]
@@ -3707,10 +3707,10 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a
;
; X32-AVX512-LABEL: test_mm_set_epi8:
; X32-AVX512: # %bb.0:
-; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x48]
-; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50]
-; X32-AVX512-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; X32-AVX512-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01]
+; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48]
+; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x50]
+; X32-AVX512-NEXT: vmovd %r10d, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc2]
+; X32-AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40]
; X32-AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38]
@@ -3837,8 +3837,8 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
;
; X64-SSE-LABEL: test_mm_set_epi16:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x10]
-; X64-SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08]
+; X64-SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10]
+; X64-SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x08]
; X64-SSE-NEXT: movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
; X64-SSE-NEXT: movd %esi, %xmm1 # encoding: [0x66,0x0f,0x6e,0xce]
; X64-SSE-NEXT: punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8]
@@ -3853,8 +3853,8 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
; X64-SSE-NEXT: movd %r9d, %xmm1 # encoding: [0x66,0x41,0x0f,0x6e,0xc9]
; X64-SSE-NEXT: punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8]
; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
-; X64-SSE-NEXT: movd %r10d, %xmm0 # encoding: [0x66,0x41,0x0f,0x6e,0xc2]
+; X64-SSE-NEXT: movd %r10d, %xmm3 # encoding: [0x66,0x41,0x0f,0x6e,0xda]
+; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
; X64-SSE-NEXT: punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3]
; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; X64-SSE-NEXT: punpckldq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x62,0xc1]
@@ -3893,8 +3893,8 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
;
; X32-SSE-LABEL: test_mm_set_epi16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10]
-; X32-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08]
+; X32-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10]
+; X32-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08]
; X32-SSE-NEXT: movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
; X32-SSE-NEXT: movd %esi, %xmm1 # encoding: [0x66,0x0f,0x6e,0xce]
; X32-SSE-NEXT: punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8]
@@ -3909,8 +3909,8 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
; X32-SSE-NEXT: movd %r9d, %xmm1 # encoding: [0x66,0x41,0x0f,0x6e,0xc9]
; X32-SSE-NEXT: punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8]
; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
-; X32-SSE-NEXT: movd %r10d, %xmm0 # encoding: [0x66,0x41,0x0f,0x6e,0xc2]
+; X32-SSE-NEXT: movd %r10d, %xmm3 # encoding: [0x66,0x41,0x0f,0x6e,0xda]
+; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
; X32-SSE-NEXT: punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3]
; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; X32-SSE-NEXT: punpckldq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x62,0xc1]
@@ -5282,30 +5282,30 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4
;
; X64-AVX1-LABEL: test_mm_setr_epi16:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x10]
-; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08]
+; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10]
+; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x08]
; X64-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
; X64-AVX1-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc6,0x01]
; X64-AVX1-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc2,0x02]
; X64-AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03]
; X64-AVX1-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04]
; X64-AVX1-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05]
-; X64-AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X64-AVX1-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07]
+; X64-AVX1-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06]
+; X64-AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
; X64-AVX1-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_setr_epi16:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x10]
-; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08]
+; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10]
+; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x08]
; X64-AVX512-NEXT: vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
; X64-AVX512-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x01]
; X64-AVX512-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x02]
; X64-AVX512-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x03]
; X64-AVX512-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04]
; X64-AVX512-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05]
-; X64-AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X64-AVX512-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07]
+; X64-AVX512-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06]
+; X64-AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
; X64-AVX512-NEXT: retq # encoding: [0xc3]
;
; X32-SSE-LABEL: test_mm_setr_epi16:
@@ -5338,30 +5338,30 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4
;
; X32-AVX1-LABEL: test_mm_setr_epi16:
; X32-AVX1: # %bb.0:
-; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10]
-; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08]
+; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10]
+; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08]
; X32-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
; X32-AVX1-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc6,0x01]
; X32-AVX1-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc2,0x02]
; X32-AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03]
; X32-AVX1-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04]
; X32-AVX1-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05]
-; X32-AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X32-AVX1-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07]
+; X32-AVX1-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06]
+; X32-AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
; X32-AVX1-NEXT: retq # encoding: [0xc3]
;
; X32-AVX512-LABEL: test_mm_setr_epi16:
; X32-AVX512: # %bb.0:
-; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10]
-; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08]
+; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10]
+; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08]
; X32-AVX512-NEXT: vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
; X32-AVX512-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x01]
; X32-AVX512-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x02]
; X32-AVX512-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x03]
; X32-AVX512-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04]
; X32-AVX512-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05]
-; X32-AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X32-AVX512-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07]
+; X32-AVX512-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06]
+; X32-AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
; X32-AVX512-NEXT: retq # encoding: [0xc3]
%res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
%res1 = insertelement <8 x i16> %res0, i16 %a1, i32 1
diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll
index 2b87e17a0b5e3..e5ea911d4771a 100644
--- a/llvm/test/CodeGen/X86/sshl_sat.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat.ll
@@ -162,16 +162,16 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
; X64-NEXT: shlb $4, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shlb %cl, %al
-; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movl %esi, %edx
+; X64-NEXT: movzbl %al, %edx
+; X64-NEXT: movl %edx, %esi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: sarb %cl, %dl
+; X64-NEXT: sarb %cl, %sil
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: testb %dil, %dil
; X64-NEXT: sets %al
; X64-NEXT: addl $127, %eax
-; X64-NEXT: cmpb %dl, %dil
-; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: cmpb %sil, %dil
+; X64-NEXT: cmovel %edx, %eax
; X64-NEXT: sarb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -357,16 +357,16 @@ define i8 @func8(i8 %x, i8 %y) nounwind {
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shlb %cl, %al
-; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movl %esi, %edx
+; X64-NEXT: movzbl %al, %edx
+; X64-NEXT: movl %edx, %esi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: sarb %cl, %dl
+; X64-NEXT: sarb %cl, %sil
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: testb %dil, %dil
; X64-NEXT: sets %al
; X64-NEXT: addl $127, %eax
-; X64-NEXT: cmpb %dl, %dil
-; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: cmpb %sil, %dil
+; X64-NEXT: cmovel %edx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 42f346c0fd558..111b7e763e189 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -967,46 +967,46 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: pandn %xmm9, %xmm10
; SSE2-NEXT: psrad $31, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm10, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm4, %xmm9
; SSE2-NEXT: pand %xmm9, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: por %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: psubd %xmm5, %xmm9
; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
; SSE2-NEXT: pxor %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm9, %xmm5
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pxor %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psubd %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psubd %xmm6, %xmm5
; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: por %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psubd %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psubd %xmm7, %xmm5
; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i32:
@@ -1017,77 +1017,77 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pandn %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm10
+; SSSE3-NEXT: pandn %xmm9, %xmm10
; SSSE3-NEXT: psrad $31, %xmm9
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: pxor %xmm10, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm4, %xmm9
; SSSE3-NEXT: pand %xmm9, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: psubd %xmm5, %xmm4
+; SSSE3-NEXT: por %xmm10, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm9
+; SSSE3-NEXT: psubd %xmm5, %xmm9
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
; SSSE3-NEXT: pxor %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm4, %xmm5
-; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pandn %xmm9, %xmm5
+; SSSE3-NEXT: psrad $31, %xmm9
+; SSSE3-NEXT: pxor %xmm4, %xmm9
+; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: psubd %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: psubd %xmm6, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pandn %xmm4, %xmm5
-; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: por %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: psubd %xmm7, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pandn %xmm5, %xmm6
+; SSSE3-NEXT: psrad $31, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: psubd %xmm7, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
; SSSE3-NEXT: pxor %xmm7, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pandn %xmm4, %xmm5
-; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: por %xmm5, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pandn %xmm5, %xmm6
+; SSSE3-NEXT: psrad $31, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: por %xmm6, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v16i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm3, %xmm8
+; SSE41-NEXT: movdqa %xmm3, %xmm11
; SSE41-NEXT: movdqa %xmm2, %xmm10
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm11, %xmm11
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: psubd %xmm4, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm9
+; SSE41-NEXT: pxor %xmm12, %xmm12
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: psubd %xmm4, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm8
+; SSE41-NEXT: movdqa %xmm9, %xmm1
; SSE41-NEXT: psubd %xmm5, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm9
+; SSE41-NEXT: pxor %xmm5, %xmm9
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm10, %xmm2
; SSE41-NEXT: psubd %xmm6, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm6
; SSE41-NEXT: pcmpgtd %xmm2, %xmm10
; SSE41-NEXT: pxor %xmm6, %xmm10
; SSE41-NEXT: movdqa %xmm2, %xmm3
@@ -1095,17 +1095,17 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSE41-NEXT: pxor %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm10, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm11, %xmm3
; SSE41-NEXT: psubd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm8
-; SSE41-NEXT: pxor %xmm7, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm11
+; SSE41-NEXT: pxor %xmm7, %xmm11
; SSE41-NEXT: movdqa %xmm3, %xmm5
; SSE41-NEXT: psrad $31, %xmm5
; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm11, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3
-; SSE41-NEXT: movaps %xmm9, %xmm0
+; SSE41-NEXT: movaps %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v16i32:
@@ -1341,61 +1341,61 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE2-LABEL: v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: psubq %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm5, %xmm7
; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: psubq %xmm3, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: psubq %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: psrad $31, %xmm1
@@ -1407,61 +1407,61 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
;
; SSSE3-LABEL: v4i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: psubq %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
; SSSE3-NEXT: movdqa %xmm5, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pand %xmm7, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pandn %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm0, %xmm6
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: psubq %xmm3, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: psubq %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm5, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: pxor %xmm6, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: pandn %xmm1, %xmm3
; SSSE3-NEXT: psrad $31, %xmm1
@@ -1474,25 +1474,25 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-LABEL: v4i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm6, %xmm0
; SSE41-NEXT: psubq %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pxor %xmm9, %xmm5
+; SSE41-NEXT: pxor %xmm6, %xmm5
; SSE41-NEXT: movdqa %xmm0, %xmm7
; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm6
-; SSE41-NEXT: por %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm9, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm8
+; SSE41-NEXT: por %xmm0, %xmm8
+; SSE41-NEXT: pxor %xmm6, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
; SSE41-NEXT: pand %xmm0, %xmm5
; SSE41-NEXT: por %xmm2, %xmm5
-; SSE41-NEXT: pxor %xmm6, %xmm5
+; SSE41-NEXT: pxor %xmm8, %xmm5
; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807]
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movapd %xmm7, %xmm2
@@ -1501,24 +1501,24 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
; SSE41-NEXT: psubq %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
+; SSE41-NEXT: pxor %xmm6, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: por %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm9, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm9
+; SSE41-NEXT: por %xmm0, %xmm9
+; SSE41-NEXT: pxor %xmm6, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm6, %xmm2
+; SSE41-NEXT: pxor %xmm9, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -1611,62 +1611,62 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE2-NEXT: por %xmm12, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm10, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
; SSE2-NEXT: pandn %xmm0, %xmm10
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: por %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: psubq %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm10
; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: psubq %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm11
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm10
+; SSE2-NEXT: por %xmm9, %xmm10
; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm10, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm5
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: psubq %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: psubq %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm9
; SSE2-NEXT: pxor %xmm8, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm5
; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
@@ -1676,44 +1676,44 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE2-NEXT: pand %xmm10, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: psubq %xmm7, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm4
+; SSE2-NEXT: psubq %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i64:
@@ -1737,62 +1737,62 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSSE3-NEXT: por %xmm12, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm11, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm9
+; SSSE3-NEXT: pxor %xmm10, %xmm9
+; SSSE3-NEXT: movdqa %xmm9, %xmm10
; SSSE3-NEXT: pandn %xmm0, %xmm10
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
-; SSSE3-NEXT: pxor %xmm9, %xmm0
-; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm9, %xmm0
; SSSE3-NEXT: por %xmm10, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm9
+; SSSE3-NEXT: pxor %xmm8, %xmm9
+; SSSE3-NEXT: psubq %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm10
; SSSE3-NEXT: pxor %xmm8, %xmm10
-; SSSE3-NEXT: psubq %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: movdqa %xmm10, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11
+; SSSE3-NEXT: movdqa %xmm9, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSSE3-NEXT: pand %xmm12, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm10
+; SSSE3-NEXT: por %xmm9, %xmm10
; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm11, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm9
+; SSSE3-NEXT: pxor %xmm10, %xmm9
+; SSSE3-NEXT: movdqa %xmm9, %xmm5
; SSSE3-NEXT: pandn %xmm1, %xmm5
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm10
-; SSSE3-NEXT: pxor %xmm8, %xmm10
-; SSSE3-NEXT: psubq %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: movdqa %xmm10, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: psubq %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm9
+; SSSE3-NEXT: pxor %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
; SSSE3-NEXT: pand %xmm11, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm9
; SSSE3-NEXT: pxor %xmm8, %xmm6
; SSSE3-NEXT: movdqa %xmm6, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
@@ -1802,44 +1802,44 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSSE3-NEXT: pand %xmm10, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pandn %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm2, %xmm6
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: por %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: psubq %xmm7, %xmm3
+; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm4
+; SSSE3-NEXT: psubq %xmm7, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm10, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
; SSSE3-NEXT: pxor %xmm8, %xmm7
-; SSSE3-NEXT: movdqa %xmm7, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm7, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm3, %xmm6
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm3
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: por %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: por %xmm6, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i64:
@@ -1900,12 +1900,12 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: psubq %xmm6, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm5
-; SSE41-NEXT: por %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm9
+; SSE41-NEXT: por %xmm0, %xmm9
; SSE41-NEXT: pxor %xmm10, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pcmpeqd %xmm10, %xmm0
@@ -1913,7 +1913,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm0, %xmm4
; SSE41-NEXT: por %xmm6, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm9, %xmm4
; SSE41-NEXT: movapd %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5
@@ -2023,66 +2023,62 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
; SSE-LABEL: v2i128:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx
; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT: seto %r10b
-; SSE-NEXT: movq %r8, %rbx
-; SSE-NEXT: sarq $63, %rbx
-; SSE-NEXT: testb %r10b, %r10b
-; SSE-NEXT: cmovneq %rbx, %rcx
+; SSE-NEXT: seto %dil
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: sarq $63, %r10
+; SSE-NEXT: testb %dil, %dil
+; SSE-NEXT: cmovneq %r10, %rcx
; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
-; SSE-NEXT: xorq %r11, %rbx
-; SSE-NEXT: testb %r10b, %r10b
-; SSE-NEXT: cmoveq %r8, %rbx
+; SSE-NEXT: xorq %r11, %r10
+; SSE-NEXT: testb %dil, %dil
+; SSE-NEXT: cmoveq %r8, %r10
; SSE-NEXT: subq %r9, %rsi
; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT: seto %r8b
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: sarq $63, %rdi
-; SSE-NEXT: testb %r8b, %r8b
-; SSE-NEXT: cmovneq %rdi, %rsi
-; SSE-NEXT: xorq %r11, %rdi
-; SSE-NEXT: testb %r8b, %r8b
-; SSE-NEXT: cmoveq %rdx, %rdi
+; SSE-NEXT: seto %dil
+; SSE-NEXT: movq %rdx, %r8
+; SSE-NEXT: sarq $63, %r8
+; SSE-NEXT: testb %dil, %dil
+; SSE-NEXT: cmovneq %r8, %rsi
+; SSE-NEXT: xorq %r11, %r8
+; SSE-NEXT: testb %dil, %dil
+; SSE-NEXT: cmoveq %rdx, %r8
; SSE-NEXT: movq %rcx, 16(%rax)
; SSE-NEXT: movq %rsi, (%rax)
-; SSE-NEXT: movq %rbx, 24(%rax)
-; SSE-NEXT: movq %rdi, 8(%rax)
-; SSE-NEXT: popq %rbx
+; SSE-NEXT: movq %r10, 24(%rax)
+; SSE-NEXT: movq %r8, 8(%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: v2i128:
; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx
; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
-; AVX-NEXT: seto %r10b
-; AVX-NEXT: movq %r8, %rbx
-; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: testb %r10b, %r10b
-; AVX-NEXT: cmovneq %rbx, %rcx
+; AVX-NEXT: seto %dil
+; AVX-NEXT: movq %r8, %r10
+; AVX-NEXT: sarq $63, %r10
+; AVX-NEXT: testb %dil, %dil
+; AVX-NEXT: cmovneq %r10, %rcx
; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000
-; AVX-NEXT: xorq %r11, %rbx
-; AVX-NEXT: testb %r10b, %r10b
-; AVX-NEXT: cmoveq %r8, %rbx
+; AVX-NEXT: xorq %r11, %r10
+; AVX-NEXT: testb %dil, %dil
+; AVX-NEXT: cmoveq %r8, %r10
; AVX-NEXT: subq %r9, %rsi
; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
-; AVX-NEXT: seto %r8b
-; AVX-NEXT: movq %rdx, %rdi
-; AVX-NEXT: sarq $63, %rdi
-; AVX-NEXT: testb %r8b, %r8b
-; AVX-NEXT: cmovneq %rdi, %rsi
-; AVX-NEXT: xorq %r11, %rdi
-; AVX-NEXT: testb %r8b, %r8b
-; AVX-NEXT: cmoveq %rdx, %rdi
+; AVX-NEXT: seto %dil
+; AVX-NEXT: movq %rdx, %r8
+; AVX-NEXT: sarq $63, %r8
+; AVX-NEXT: testb %dil, %dil
+; AVX-NEXT: cmovneq %r8, %rsi
+; AVX-NEXT: xorq %r11, %r8
+; AVX-NEXT: testb %dil, %dil
+; AVX-NEXT: cmoveq %rdx, %r8
; AVX-NEXT: movq %rcx, 16(%rax)
; AVX-NEXT: movq %rsi, (%rax)
-; AVX-NEXT: movq %rbx, 24(%rax)
-; AVX-NEXT: movq %rdi, 8(%rax)
-; AVX-NEXT: popq %rbx
+; AVX-NEXT: movq %r10, 24(%rax)
+; AVX-NEXT: movq %r8, 8(%rax)
; AVX-NEXT: retq
%z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
index a8efb841a2b8d..09d003d5ab8e4 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
@@ -328,7 +328,6 @@ body: |
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: TEST64rr [[COPY2]], [[COPY2]], implicit-def $eflags
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[COPY2]]
- ; CHECK-NEXT: [[MOV32ri1:%[0-9]+]]:gr32 = MOV32ri -1
; CHECK-NEXT: JCC_1 %bb.9, 4, implicit killed $eflags
; CHECK-NEXT: JMP_1 %bb.6
; CHECK-NEXT: {{ $}}
@@ -342,8 +341,8 @@ body: |
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[NOT64r1]]
; CHECK-NEXT: [[OR32ri:%[0-9]+]]:gr32 = OR32ri [[OR32ri]], 268435456, implicit-def dead $eflags
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr32 = COPY [[OR32ri]]
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[COPY3]]
; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[COPY3]]
; CHECK-NEXT: undef %81.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.7.bb33:
@@ -353,9 +352,9 @@ body: |
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm undef %59:gr64, 1, $noreg, 0, $noreg :: (load unordered (s64) from `i8 addrspace(1)* addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: [[NOT64r2:%[0-9]+]]:gr64 = NOT64r [[NOT64r2]]
; CHECK-NEXT: CMP64rr [[NOT64r2]], [[COPY6]], implicit-def $eflags
- ; CHECK-NEXT: undef %102.sub_32bit:gr64_with_sub_8bit = MOV32ri 0
- ; CHECK-NEXT: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %102, 4, implicit killed $eflags
- ; CHECK-NEXT: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %102, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+ ; CHECK-NEXT: undef %100.sub_32bit:gr64_with_sub_8bit = MOV32ri 0
+ ; CHECK-NEXT: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %100, 4, implicit killed $eflags
+ ; CHECK-NEXT: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %100, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
; CHECK-NEXT: LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, [[COPY5]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $rdi = COPY [[COPY4]]
@@ -383,7 +382,7 @@ body: |
; CHECK-NEXT: bb.9.bb64:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: LCMPXCHG32 undef %76:gr64, 1, $noreg, 0, $noreg, [[MOV32ri1]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: LCMPXCHG32 undef %76:gr64, 1, $noreg, 0, $noreg, [[MOV32ri]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: STATEPOINT 2, 5, 1, undef %79:gr64, undef $rdi, 2, 0, 2, 0, 2, 27, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 133, 2, 0, 2, 5, 2, 1, 2, 7, 2, 0, 2, 8, 2, 2, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 8, 2, 2, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir
index 142a749a6a443..8af409ecddf06 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir
@@ -212,8 +212,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-NEXT: undef %75.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
- ; CHECK-NEXT: MOV64mr %stack.2, 1, $noreg, 0, $noreg, %75 :: (store (s64) into %stack.2)
+ ; CHECK-NEXT: undef %39.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi
; CHECK-NEXT: STATEPOINT 2, 5, 2, undef %24:gr64, $rdi, undef $rsi, 2, 0, 2, 0, 2, 37, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 6, 2, 0, 2, 4, 2, 1, 2, 0, 2, 0, 2, 7, 2, 0, 2, 0, 2, 0, 2, 7, 2, 0, 2, 0, 2, 0, 2, 2, 2, 4, 2, 5, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, 2, 0, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
@@ -238,9 +237,7 @@ body: |
; CHECK-NEXT: $esi = COPY %66.sub_32bit
; CHECK-NEXT: $edx = COPY [[LEA64_32r]]
; CHECK-NEXT: $r8d = COPY [[MOV32rm]]
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY [[MOV64rm]]
- ; CHECK-NEXT: [[STATEPOINT:%[0-9]+]]:gr64, [[STATEPOINT1:%[0-9]+]]:gr64, [[STATEPOINT2:%[0-9]+]]:gr64, [[STATEPOINT3:%[0-9]+]]:gr64 = STATEPOINT 2, 5, 5, undef %35:gr64, $rdi, $esi, $edx, undef $rcx, $r8d, 2, 0, 2, 0, 2, 85, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 63, 2, 0, 2, 9, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, [[STATEPOINT1]], 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 4278124286, 2, 3, 1, 4, %stack.1, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 4, [[STATEPOINT]](tied-def 0), [[STATEPOINT1]](tied-def 1), [[STATEPOINT2]](tied-def 2), [[STATEPOINT3]](tied-def 3), 2, 0, 2, 4, 0, 0, 1, 1, 2, 2, 3, 3, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0), (volatile load store (s32) on %stack.1)
+ ; CHECK-NEXT: [[STATEPOINT:%[0-9]+]]:gr64, [[STATEPOINT1:%[0-9]+]]:gr64, [[STATEPOINT2:%[0-9]+]]:gr64, [[STATEPOINT3:%[0-9]+]]:gr64_with_sub_8bit = STATEPOINT 2, 5, 5, undef %35:gr64, $rdi, $esi, $edx, undef $rcx, $r8d, 2, 0, 2, 0, 2, 85, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 63, 2, 0, 2, 9, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, [[STATEPOINT1]], 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 4278124286, 2, 3, 1, 4, %stack.1, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 4, [[STATEPOINT]](tied-def 0), [[STATEPOINT1]](tied-def 1), [[STATEPOINT2]](tied-def 2), [[STATEPOINT3]](tied-def 3), 2, 0, 2, 4, 0, 0, 1, 1, 2, 2, 3, 3, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0), (volatile load store (s32) on %stack.1)
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CMP32rr %65.sub_32bit, undef %37:gr32, implicit-def $eflags
; CHECK-NEXT: JCC_1 %bb.4, 13, implicit killed $eflags
@@ -248,10 +245,8 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY [[STATEPOINT3]]
- ; CHECK-NEXT: MOV64mr %stack.2, 1, $noreg, 0, $noreg, [[COPY]] :: (store (s64) into %stack.2)
; CHECK-NEXT: [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY [[LEA64_32r]]
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY [[LEA64_32r]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.bb21:
; CHECK-NEXT: successors: %bb.1(0x80000000)
@@ -265,8 +260,8 @@ body: |
; CHECK-NEXT: EH_LABEL <mcsymbol .Ltmp0>
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $ecx = MOV32r0 implicit-def dead $eflags
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY [[LEA64_32r]]
- ; CHECK-NEXT: [[STATEPOINT2]]:gr64, [[STATEPOINT3]]:gr64, [[STATEPOINT]]:gr64, dead [[STATEPOINT1]]:gr64 = STATEPOINT 1, 16, 5, undef %47:gr64, undef $edi, undef $rsi, undef $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 99, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, [[STATEPOINT1]], 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 8, 2, 12, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 5, [[STATEPOINT2]](tied-def 0), [[STATEPOINT3]](tied-def 1), [[STATEPOINT]](tied-def 2), [[STATEPOINT1]](tied-def 3), 2, 4278124286, 2, 0, 2, 5, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY [[LEA64_32r]]
+ ; CHECK-NEXT: [[STATEPOINT2]]:gr64, [[STATEPOINT3]]:gr64_with_sub_8bit, [[STATEPOINT]]:gr64, dead [[STATEPOINT1]]:gr64 = STATEPOINT 1, 16, 5, undef %47:gr64, undef $edi, undef $rsi, undef $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 99, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, [[STATEPOINT1]], 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 8, 2, 12, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 5, [[STATEPOINT2]](tied-def 0), [[STATEPOINT3]](tied-def 1), [[STATEPOINT]](tied-def 2), [[STATEPOINT1]](tied-def 3), 2, 4278124286, 2, 0, 2, 5, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0)
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: EH_LABEL <mcsymbol .Ltmp1>
; CHECK-NEXT: JMP_1 %bb.5
@@ -274,9 +269,7 @@ body: |
; CHECK-NEXT: bb.5.bb30:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[STATEPOINT3]]
- ; CHECK-NEXT: MOV64mr %stack.2, 1, $noreg, 0, $noreg, [[COPY3]] :: (store (s64) into %stack.2)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[STATEPOINT2]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[STATEPOINT2]]
; CHECK-NEXT: [[ADD64ri8_:%[0-9]+]]:gr64 = nuw ADD64ri8 [[ADD64ri8_]], 28, implicit-def dead $eflags
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @barney, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp
@@ -292,7 +285,7 @@ body: |
; CHECK-NEXT: EH_LABEL <mcsymbol .Ltmp2>
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $edi = MOV32ri 3
- ; CHECK-NEXT: dead [[STATEPOINT3]]:gr64, dead [[DEF]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @wombat, $edi, 2, 0, 2, 2, 2, 97, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 3, [[COPY2]], 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 0, 2, 12, 2, 51, 2, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 3, [[STATEPOINT3]](tied-def 0), 2, 4278124286, [[DEF]](tied-def 1), 2, 0, 2, 3, 0, 0, 1, 1, 2, 2, csr_64, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: dead [[STATEPOINT3]]:gr64_with_sub_8bit, dead [[DEF]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @wombat, $edi, 2, 0, 2, 2, 2, 97, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 3, [[COPY1]], 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 0, 2, 12, 2, 51, 2, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 3, [[STATEPOINT3]](tied-def 0), 2, 4278124286, [[DEF]](tied-def 1), 2, 0, 2, 3, 0, 0, 1, 1, 2, 2, csr_64, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
bb.0.bb:
successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir
index ed8e5a962508d..15b5ba26868b9 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir
@@ -273,7 +273,7 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm undef %17:gr64, 1, $noreg, 0, $noreg :: (load unordered (s64) from `i8 addrspace(1)* addrspace(1)* undef`, addrspace 1)
- ; CHECK-NEXT: [[NOT64r:%[0-9]+]]:gr64 = NOT64r [[NOT64r]]
+ ; CHECK-NEXT: [[NOT64r:%[0-9]+]]:gr64 = NOT64r [[MOV64rm]]
; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, [[NOT64r]] :: (store (s64) into %stack.1)
; CHECK-NEXT: undef %48.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF
@@ -332,8 +332,7 @@ body: |
; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1)
; CHECK-NEXT: dead $edx = MOV32r0 implicit-def dead $eflags, implicit-def $rdx
; CHECK-NEXT: $ecx = MOV32r0 implicit-def dead $eflags
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[DEF2]]
- ; CHECK-NEXT: dead [[MOV64rm]]:gr64, dead [[COPY1]]:gr64, dead [[DEF1]]:gr64, dead [[DEF]]:gr64 = STATEPOINT 1, 16, 5, undef %41:gr64, undef $edi, undef $rsi, $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 89, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[COPY1]], 2, 7, 2, 0, 2, 0, [[COPY1]], 2, 10, 2, 2, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[COPY1]], 2, 0, [[COPY1]], 2, 7, 2, 0, 2, 0, [[COPY1]], 2, 2, 2, 8, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 12, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 18, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, [[DEF1]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 8, 2, 9, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, [[DEF1]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 5, [[MOV64rm]](tied-def 0), [[COPY1]](tied-def 1), [[DEF1]](tied-def 2), 2, 4278124286, [[DEF]](tied-def 3), 2, 0, 2, 5, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0)
+ ; CHECK-NEXT: dead [[MOV64rm]]:gr64, dead [[DEF2]]:gr64_with_sub_8bit, dead [[DEF1]]:gr64, dead [[DEF]]:gr64 = STATEPOINT 1, 16, 5, undef %41:gr64, undef $edi, undef $rsi, $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 89, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF2]], 2, 7, 2, 0, 2, 0, [[DEF2]], 2, 10, 2, 2, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[DEF2]], 2, 0, [[DEF2]], 2, 7, 2, 0, 2, 0, [[DEF2]], 2, 2, 2, 8, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 12, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 18, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, [[DEF1]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 8, 2, 9, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, [[DEF1]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 5, [[MOV64rm]](tied-def 0), [[DEF2]](tied-def 1), [[DEF1]](tied-def 2), 2, 4278124286, [[DEF]](tied-def 3), 2, 0, 2, 5, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0)
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: EH_LABEL <mcsymbol .Ltmp1>
; CHECK-NEXT: JMP_1 %bb.10
@@ -342,14 +341,14 @@ body: |
; CHECK-NEXT: successors: %bb.11(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DEF3:%[0-9]+]]:gr64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:gr64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:gr64 = IMPLICIT_DEF
; CHECK-NEXT: undef [[DEF2]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.11.bb27:
; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.17(0x00000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: TEST32rr [[ADD32rr]], [[ADD32rr]], implicit-def $eflags
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY [[ADD32rr]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY [[ADD32rr]]
; CHECK-NEXT: JCC_1 %bb.2, 8, implicit $eflags
; CHECK-NEXT: JMP_1 %bb.17
; CHECK-NEXT: {{ $}}
@@ -387,13 +386,12 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $edi = MOV32ri -39
- ; CHECK-NEXT: STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 103, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 10, 2, 2, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, 2, 8, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 12, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 18, 2, 63, 2, 0, 2, 9, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 3, [[COPY2]], 2, 3, 2, 0, 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, 2, 33, 2, 6, 2, 0, 2, 5, 2, 0, 2, 0, 2, 4278124286, 2, 3, [[COPY2]], 2, 3, 2, 4278124286, 2, 0, 2, 4278124286, 2, 3, 2, 4278124286, 2, 1, 2, 34, 2, 14, 2, 0, 2, 3, 2, 0, 2, 3, [[COPY2]], 2, 3, 2, 4278124286, 2, 3, 2, 0, 2, 1, 2, 4278124286, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 103, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 10, 2, 2, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, 2, 8, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 12, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 18, 2, 63, 2, 0, 2, 9, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 3, [[COPY1]], 2, 3, 2, 0, 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, 2, 33, 2, 6, 2, 0, 2, 5, 2, 0, 2, 0, 2, 4278124286, 2, 3, [[COPY1]], 2, 3, 2, 4278124286, 2, 0, 2, 4278124286, 2, 3, 2, 4278124286, 2, 1, 2, 34, 2, 14, 2, 0, 2, 3, 2, 0, 2, 3, [[COPY1]], 2, 3, 2, 4278124286, 2, 3, 2, 0, 2, 1, 2, 4278124286, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.17.bb44:
; CHECK-NEXT: successors: %bb.22(0x40000000), %bb.18(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[DEF2]]
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @hoge.1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
@@ -420,7 +418,7 @@ body: |
; CHECK-NEXT: EH_LABEL <mcsymbol .Ltmp3>
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $ecx = MOV32r0 implicit-def dead $eflags
- ; CHECK-NEXT: [[STATEPOINT:%[0-9]+]]:gr64 = STATEPOINT 1, 16, 5, undef %60:gr64, undef $edi, undef $rsi, undef $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 45, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 8, 2, 9, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, [[STATEPOINT]](tied-def 0), 2, 4278124286, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: [[STATEPOINT:%[0-9]+]]:gr64_with_sub_8bit = STATEPOINT 1, 16, 5, undef %60:gr64, undef $edi, undef $rsi, undef $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 45, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 8, 2, 9, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, [[STATEPOINT]](tied-def 0), 2, 4278124286, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: EH_LABEL <mcsymbol .Ltmp4>
; CHECK-NEXT: JMP_1 %bb.21
@@ -435,7 +433,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $edi = MOV32ri 10
- ; CHECK-NEXT: dead [[STATEPOINT]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 45, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 1, 2, 9, 2, 6, 2, 1, 2, 3, 2, 0, 2, 0, 2, 0, 2, 0, 2, 4278124286, 2, 0, 2, 0, 2, 7, 2, 0, 2, 3, [[STATEPOINT]](tied-def 0), 2, 0, 2, 4278124286, 2, 0, 2, 3, 0, 0, 1, 1, 2, 2, csr_64, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: dead [[STATEPOINT]]:gr64_with_sub_8bit = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 45, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 1, 2, 9, 2, 6, 2, 1, 2, 3, 2, 0, 2, 0, 2, 0, 2, 0, 2, 4278124286, 2, 0, 2, 0, 2, 7, 2, 0, 2, 3, [[STATEPOINT]](tied-def 0), 2, 0, 2, 4278124286, 2, 0, 2, 3, 0, 0, 1, 1, 2, 2, csr_64, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.23.bb52 (landing-pad):
@@ -445,13 +443,13 @@ body: |
; CHECK-NEXT: EH_LABEL <mcsymbol .Ltmp5>
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $edi = MOV32ri 3
- ; CHECK-NEXT: dead [[STATEPOINT]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 43, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 0, 2, 9, 2, 51, 2, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, [[STATEPOINT]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: dead [[STATEPOINT]]:gr64_with_sub_8bit = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 43, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 0, 2, 9, 2, 51, 2, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, [[STATEPOINT]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.24.bb56:
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $edi = MOV32ri 10
- ; CHECK-NEXT: dead [[STATEPOINT]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 33, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 9, 2, 2, 2, 26, 2, 1, 2, 3, 2, 1, 2, 0, 2, 0, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 2, [[STATEPOINT]](tied-def 0), 2, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: dead [[STATEPOINT]]:gr64_with_sub_8bit = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 33, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 9, 2, 2, 2, 26, 2, 1, 2, 3, 2, 1, 2, 0, 2, 0, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 2, [[STATEPOINT]](tied-def 0), 2, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
bb.0.bb:
successors: %bb.1(0x80000000), %bb.12(0x00000000)
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll b/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll
index 2ddf22e762098..68874074812e3 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll
@@ -31,18 +31,14 @@ define void @test(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %r9d, %r14d
-; CHECK-NEXT: movl %r8d, %r15d
-; CHECK-NEXT: movl %ecx, %r12d
-; CHECK-NEXT: movl %edx, %r13d
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %r9d, %ebx
+; CHECK-NEXT: movl %r8d, %ebp
+; CHECK-NEXT: movl %ecx, %r14d
+; CHECK-NEXT: movl %edx, %r15d
+; CHECK-NEXT: movl %esi, %r12d
+; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: movabsq $_bar, %rax
; CHECK-NEXT: callq *%rax
-; CHECK-NEXT: movl %ebp, %eax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl %ebx, %eax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl %r13d, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl %r12d, %eax
@@ -51,6 +47,10 @@ define void @test(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl %r14d, %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -63,9 +63,9 @@ define void @test(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r14d
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r11d
@@ -77,8 +77,8 @@ define void @test(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: movabsq $_bar, %r13
-; CHECK-NEXT: callq *%r13 ## 96-byte Folded Reload
+; CHECK-NEXT: movabsq $_bar, %r15
+; CHECK-NEXT: callq *%r15 ## 96-byte Folded Reload
; CHECK-NEXT: Ltmp0:
; CHECK-NEXT: addq $104, %rsp
; CHECK-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll
index 0647c0b0e7bf2..2b02656071a7b 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -372,12 +372,12 @@ define void @test10(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %r9d, %r15d
-; CHECK-NEXT: movl %r8d, %r14d
-; CHECK-NEXT: movl %ecx, %r12d
-; CHECK-NEXT: movl %edx, %r13d
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %r9d, %ebp
+; CHECK-NEXT: movl %r8d, %ebx
+; CHECK-NEXT: movl %ecx, %r14d
+; CHECK-NEXT: movl %edx, %r15d
+; CHECK-NEXT: movl %esi, %r12d
+; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: callq _bar
; CHECK-NEXT: Ltmp11:
; CHECK-NEXT: callq _bar
diff --git a/llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll b/llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll
index 7ca436cdb9126..82a2f05760309 100644
--- a/llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll
+++ b/llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll
@@ -30,28 +30,28 @@ define void @test(ptr addrspace(1) %b) gc "statepoint-example" {
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq (%rdi), %r14
+; CHECK-NEXT: movq (%rdi), %rbx
; CHECK-NEXT: movq 8(%rdi), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq 16(%rdi), %r15
-; CHECK-NEXT: movq 24(%rdi), %r12
-; CHECK-NEXT: movq 32(%rdi), %r13
-; CHECK-NEXT: movq 40(%rdi), %rbx
+; CHECK-NEXT: movq 16(%rdi), %r14
+; CHECK-NEXT: movq 24(%rdi), %r15
+; CHECK-NEXT: movq 32(%rdi), %r12
+; CHECK-NEXT: movq 40(%rdi), %r13
; CHECK-NEXT: movq 48(%rdi), %rbp
-; CHECK-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: callq foo@PLT # 8-byte Folded Reload
; CHECK-NEXT: .Ltmp0:
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; CHECK-NEXT: movq %rbp, %rdi
; CHECK-NEXT: callq bar@PLT
-; CHECK-NEXT: movq %rbx, %rdi
-; CHECK-NEXT: callq bar@PLT
; CHECK-NEXT: movq %r13, %rdi
; CHECK-NEXT: callq bar@PLT
; CHECK-NEXT: movq %r12, %rdi
; CHECK-NEXT: callq bar@PLT
; CHECK-NEXT: movq %r15, %rdi
; CHECK-NEXT: callq bar@PLT
+; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: callq bar@PLT
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; CHECK-NEXT: callq bar@PLT
; CHECK-NEXT: addq $24, %rsp
diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
index a30c4c697114e..9338da5de8001 100644
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -75,12 +75,12 @@ define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %r9d, %r14d
-; CHECK-NEXT: movl %r8d, %r15d
-; CHECK-NEXT: movl %ecx, %r12d
-; CHECK-NEXT: movl %edx, %r13d
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %r9d, %ebx
+; CHECK-NEXT: movl %r8d, %ebp
+; CHECK-NEXT: movl %ecx, %r14d
+; CHECK-NEXT: movl %edx, %r15d
+; CHECK-NEXT: movl %esi, %r12d
+; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: callq _bar
; CHECK-NEXT: Ltmp3:
; CHECK-NEXT: addq $8, %rsp
@@ -123,12 +123,12 @@ define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %r9d, %r14d
-; CHECK-NEXT: movl %r8d, %r15d
-; CHECK-NEXT: movl %ecx, %r12d
-; CHECK-NEXT: movl %edx, %r13d
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %r9d, %ebx
+; CHECK-NEXT: movl %r8d, %ebp
+; CHECK-NEXT: movl %ecx, %r14d
+; CHECK-NEXT: movl %edx, %r15d
+; CHECK-NEXT: movl %esi, %r12d
+; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: callq _bar
; CHECK-NEXT: Ltmp4:
; CHECK-NEXT: addq $8, %rsp
@@ -234,12 +234,12 @@ define void @test7(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl %edi, %r13d
-; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %r12d
+; CHECK-NEXT: movl %esi, %r13d
; CHECK-NEXT: movl %edx, %ebp
-; CHECK-NEXT: movl %ecx, %r14d
-; CHECK-NEXT: movl %r8d, %r15d
-; CHECK-NEXT: movl %r9d, %r12d
+; CHECK-NEXT: movl %ecx, %ebx
+; CHECK-NEXT: movl %r8d, %r14d
+; CHECK-NEXT: movl %r9d, %r15d
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -330,10 +330,10 @@ define void @test8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: movl %r8d, (%rsp) ## 4-byte Spill
-; CHECK-NEXT: movl %ecx, %r12d
-; CHECK-NEXT: movl %edx, %r13d
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %ecx, %r14d
+; CHECK-NEXT: movl %edx, %r15d
+; CHECK-NEXT: movl %esi, %r12d
+; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -366,8 +366,8 @@ define void @test8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r14d
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp
; CHECK-NEXT: callq _bar ## 132-byte Folded Reload
; CHECK-NEXT: Ltmp10:
; CHECK-NEXT: addq $136, %rsp
@@ -434,12 +434,12 @@ define void @test9(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %r9d, %r14d
-; CHECK-NEXT: movl %r8d, %r15d
-; CHECK-NEXT: movl %ecx, %r12d
-; CHECK-NEXT: movl %edx, %r13d
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %r9d, %ebx
+; CHECK-NEXT: movl %r8d, %ebp
+; CHECK-NEXT: movl %ecx, %r14d
+; CHECK-NEXT: movl %edx, %r15d
+; CHECK-NEXT: movl %esi, %r12d
+; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: callq _bar
; CHECK-NEXT: Ltmp11:
; CHECK-NEXT: addq $8, %rsp
@@ -484,12 +484,12 @@ define void @test10(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %r9d, %r15d
-; CHECK-NEXT: movl %r8d, %r14d
-; CHECK-NEXT: movl %ecx, %r12d
-; CHECK-NEXT: movl %edx, %r13d
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %r9d, %ebp
+; CHECK-NEXT: movl %r8d, %ebx
+; CHECK-NEXT: movl %ecx, %r14d
+; CHECK-NEXT: movl %edx, %r15d
+; CHECK-NEXT: movl %esi, %r12d
+; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: callq _bar
; CHECK-NEXT: Ltmp12:
; CHECK-NEXT: callq _bar
diff --git a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
index fd05d7ea21df5..f7e053d384c99 100644
--- a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
+++ b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
@@ -15,14 +15,14 @@ define i1 @test_spill_slot_size(i1 %a1, i2 %a2, i7 %a7, i8 %a8, i9 %a9, i15 %a15
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movl %edi, %ebx
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movl %eax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-NEXT: movw %r11w, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r10d, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: andb $3, %sil
; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/statepoint-stack-usage.ll b/llvm/test/CodeGen/X86/statepoint-stack-usage.ll
index 23373cfe52723..59c7098624ac0 100644
--- a/llvm/test/CodeGen/X86/statepoint-stack-usage.ll
+++ b/llvm/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -65,17 +65,17 @@ define i32 @back_to_back_deopt(i32 %a, i32 %b, i32 %c) #1
; CHECK-DAG: movl %esi, 8(%rsp)
; CHECK-DAG: movl %edx, 4(%rsp)
; CHECK: callq
-; CHECK-DAG: movl %ebx, 12(%rsp)
+; CHECK-DAG: movl %r14d, 12(%rsp)
; CHECK-DAG: movl %ebp, 8(%rsp)
-; CHECK-DAG: movl %r14d, 4(%rsp)
+; CHECK-DAG: movl %ebx, 4(%rsp)
; CHECK: callq
-; CHECK-DAG: movl %ebx, 12(%rsp)
+; CHECK-DAG: movl %r14d, 12(%rsp)
; CHECK-DAG: movl %ebp, 8(%rsp)
-; CHECK-DAG: movl %r14d, 4(%rsp)
+; CHECK-DAG: movl %ebx, 4(%rsp)
; CHECK: callq
-; CHECK-DAG: movl %ebx, 12(%rsp)
+; CHECK-DAG: movl %r14d, 12(%rsp)
; CHECK-DAG: movl %ebp, 8(%rsp)
-; CHECK-DAG: movl %r14d, 4(%rsp)
+; CHECK-DAG: movl %ebx, 4(%rsp)
; CHECK: callq
call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-details.ll b/llvm/test/CodeGen/X86/statepoint-vreg-details.ll
index 5bc6d43be76e5..3e63abdcd2e25 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg-details.ll
+++ b/llvm/test/CodeGen/X86/statepoint-vreg-details.ll
@@ -59,15 +59,15 @@ define void @test_mixed(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(
; CHECK-VREG: CALL64pcrel32 @consume5, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit $rcx, implicit $r8, implicit-def $rsp, implicit-def $ssp
; CHECK-PREG-LABEL: name: test_mixed
-; CHECK-PREG: renamable $r14 = COPY $rdx
-; CHECK-PREG: renamable $r15 = COPY $rsi
-; CHECK-PREG: renamable $rbx = COPY $rdi
-; CHECK-PREG: renamable $r14, renamable $r15, renamable $rbx = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 4, killed renamable $r14(tied-def 0), 2, 0, killed renamable $r15(tied-def 1), killed renamable $rbx(tied-def 2), 2, 0, 2, 4, 0, 0, 1, 1, 2, 2, 3, 3, csr_64, implicit-def $rsp, implicit-def $ssp
-; CHECK-PREG: $rdi = COPY killed renamable $rbx
+; CHECK-PREG: renamable $rbx = COPY $rdx
+; CHECK-PREG: renamable $r14 = COPY $rsi
+; CHECK-PREG: renamable $r15 = COPY $rdi
+; CHECK-PREG: renamable $rbx, renamable $r14, renamable $r15 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 4, killed renamable $rbx(tied-def 0), 2, 0, killed renamable $r14(tied-def 1), killed renamable $r15(tied-def 2), 2, 0, 2, 4, 0, 0, 1, 1, 2, 2, 3, 3, csr_64, implicit-def $rsp, implicit-def $ssp
+; CHECK-PREG: $rdi = COPY killed renamable $r15
; CHECK-PREG: dead $esi = MOV32r0 implicit-def dead $eflags, implicit-def $rsi
-; CHECK-PREG: $rdx = COPY killed renamable $r15
+; CHECK-PREG: $rdx = COPY killed renamable $r14
; CHECK-PREG: dead $ecx = MOV32r0 implicit-def dead $eflags, implicit-def $rcx
-; CHECK-PREG: $r8 = COPY killed renamable $r14
+; CHECK-PREG: $r8 = COPY killed renamable $rbx
; CHECK-PREG: CALL64pcrel32 @consume5, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit killed $rcx, implicit killed $r8, implicit-def $rsp, implicit-def $ssp
entry:
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll b/llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll
index 1b6c9d5b7b913..8ed7829f43184 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll
+++ b/llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll
@@ -66,58 +66,58 @@ define ptr addrspace(1) @test_invoke_same_val(i1 %cond, ptr addrspace(1) %val1,
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; CHECK-NEXT: liveins: $edi, $rcx, $rdx, $rsi
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $rbx = COPY $rcx
- ; CHECK-NEXT: renamable $rbp = COPY $rdx
- ; CHECK-NEXT: renamable $r14d = COPY $edi
- ; CHECK-NEXT: TEST8ri renamable $r14b, 1, implicit-def $eflags
+ ; CHECK-NEXT: renamable $r14 = COPY $rcx
+ ; CHECK-NEXT: renamable $r15 = COPY $rdx
+ ; CHECK-NEXT: renamable $ebx = COPY $edi
+ ; CHECK-NEXT: TEST8ri renamable $bl, 1, implicit-def $eflags
; CHECK-NEXT: JCC_1 %bb.3, 4, implicit killed $eflags
; CHECK-NEXT: JMP_1 %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.left:
; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.6(0x00000800)
- ; CHECK-NEXT: liveins: $rbp, $rsi, $r14d
+ ; CHECK-NEXT: liveins: $ebx, $rsi, $r15
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, renamable $rsi :: (store (s64) into %stack.0)
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $rdi = COPY killed renamable $rsi
- ; CHECK-NEXT: renamable $rbp = STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 0, 2, 2, killed renamable $rbp(tied-def 0), 1, 8, %stack.0, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0)
+ ; CHECK-NEXT: renamable $r15 = STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 0, 2, 2, killed renamable $r15(tied-def 0), 1, 8, %stack.0, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0)
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: JMP_1 %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.left.relocs:
; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: liveins: $rbp, $r14d
+ ; CHECK-NEXT: liveins: $ebx, $r15
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $rbx = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
+ ; CHECK-NEXT: renamable $r14 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.right:
; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.7(0x00000800)
- ; CHECK-NEXT: liveins: $rbp, $rbx, $rsi, $r14d
+ ; CHECK-NEXT: liveins: $ebx, $rsi, $r14, $r15
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rbp :: (store (s64) into %stack.0)
+ ; CHECK-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $r15 :: (store (s64) into %stack.0)
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: $rdi = COPY killed renamable $rsi
- ; CHECK-NEXT: renamable $rbx = STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 0, 2, 2, killed renamable $rbx(tied-def 0), 1, 8, %stack.0, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0)
+ ; CHECK-NEXT: renamable $r14 = STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 0, 2, 2, killed renamable $r14(tied-def 0), 1, 8, %stack.0, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0)
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: JMP_1 %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4.right.relocs:
; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: liveins: $rbx, $r14d
+ ; CHECK-NEXT: liveins: $ebx, $r14
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $rbp = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
+ ; CHECK-NEXT: renamable $r15 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5.normal_return:
- ; CHECK-NEXT: liveins: $rbp, $rbx, $r14d
+ ; CHECK-NEXT: liveins: $ebx, $r14, $r15
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: TEST8ri renamable $r14b, 1, implicit-def $eflags, implicit killed $r14d
- ; CHECK-NEXT: renamable $rbx = CMOV64rr killed renamable $rbx, killed renamable $rbp, 4, implicit killed $eflags
- ; CHECK-NEXT: $rax = COPY killed renamable $rbx
+ ; CHECK-NEXT: TEST8ri renamable $bl, 1, implicit-def $eflags, implicit killed $ebx
+ ; CHECK-NEXT: renamable $r14 = CMOV64rr killed renamable $r14, killed renamable $r15, 4, implicit killed $eflags
+ ; CHECK-NEXT: $rax = COPY killed renamable $r14
; CHECK-NEXT: RET 0, $rax
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.6.exceptional_return.left (landing-pad):
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
index aace5052ca0bd..0594f2fbc0a35 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
+++ b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
@@ -77,13 +77,13 @@ define i32 @test_spill(
; CHECK-PREG-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.1)
; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.5, align 16)
; CHECK-PREG-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0)
- ; CHECK-PREG-NEXT: renamable $rbx = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4)
- ; CHECK-PREG-NEXT: renamable $r13 = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
- ; CHECK-PREG-NEXT: renamable $r12 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2)
- ; CHECK-PREG-NEXT: renamable $r14 = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16)
- ; CHECK-PREG-NEXT: renamable $r15 = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0)
+ ; CHECK-PREG-NEXT: renamable $r13 = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4)
+ ; CHECK-PREG-NEXT: renamable $r12 = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
+ ; CHECK-PREG-NEXT: renamable $r15 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2)
+ ; CHECK-PREG-NEXT: renamable $rbx = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16)
+ ; CHECK-PREG-NEXT: renamable $r14 = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0)
; CHECK-PREG-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-PREG-NEXT: renamable $r15, renamable $r14, renamable $r12, renamable $r13, renamable $rbx, renamable $rbp = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, killed renamable $r15(tied-def 0), killed renamable $r14(tied-def 1), killed renamable $r12(tied-def 2), killed renamable $r13(tied-def 3), killed renamable $rbx(tied-def 4), 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.7, 0, 1, 8, %stack.8, 0, 1, 8, %stack.2, 0, 1, 8, %stack.6, 0, 1, 8, %stack.9, 0, 1, 8, %stack.10, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11)
+ ; CHECK-PREG-NEXT: renamable $r14, renamable $rbx, renamable $r15, renamable $r12, renamable $r13, renamable $rbp = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, killed renamable $r14(tied-def 0), killed renamable $rbx(tied-def 1), killed renamable $r15(tied-def 2), killed renamable $r12(tied-def 3), killed renamable $r13(tied-def 4), 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.7, 0, 1, 8, %stack.8, 0, 1, 8, %stack.2, 0, 1, 8, %stack.6, 0, 1, 8, %stack.9, 0, 1, 8, %stack.10, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11)
; CHECK-PREG-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-PREG-NEXT: renamable $eax = MOV32rm killed renamable $rbp, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1)
; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
@@ -110,11 +110,11 @@ define i32 @test_spill(
; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1)
; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1)
- ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1)
- ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r13, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1)
- ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1)
- ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1)
- ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1)
+ ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r13, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1)
+ ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1)
+ ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1)
+ ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1)
+ ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1)
; CHECK-PREG-NEXT: RET 0, $eax
ptr addrspace(1) %arg00, ptr addrspace(1) %arg01, ptr addrspace(1) %arg02, ptr addrspace(1) %arg03, ptr addrspace(1) %arg04, ptr addrspace(1) %arg05,
ptr addrspace(1) %arg06, ptr addrspace(1) %arg07, ptr addrspace(1) %arg08, ptr addrspace(1) %arg09, ptr addrspace(1) %arg10, ptr addrspace(1) %arg11,
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.ll b/llvm/test/CodeGen/X86/statepoint-vreg.ll
index 4f05795fff793..12b1f55fa762f 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg.ll
+++ b/llvm/test/CodeGen/X86/statepoint-vreg.ll
@@ -62,16 +62,16 @@ define void @test_mixed(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(
; CHECK-NEXT: .cfi_offset %rbx, -32
; CHECK-NEXT: .cfi_offset %r14, -24
; CHECK-NEXT: .cfi_offset %r15, -16
-; CHECK-NEXT: movq %rdx, %r14
-; CHECK-NEXT: movq %rsi, %r15
-; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rsi, %r14
+; CHECK-NEXT: movq %rdi, %r15
; CHECK-NEXT: callq func@PLT
; CHECK-NEXT: .Ltmp1:
-; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: movq %r15, %rdi
; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: movq %r15, %rdx
+; CHECK-NEXT: movq %r14, %rdx
; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: movq %r14, %r8
+; CHECK-NEXT: movq %rbx, %r8
; CHECK-NEXT: callq consume5@PLT
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 24
@@ -251,17 +251,17 @@ define i1 @test_cross_bb(ptr addrspace(1) %a, i1 %external_cond) gc "statepoint-
; CHECK-NEXT: .cfi_offset %rbx, -32
; CHECK-NEXT: .cfi_offset %r14, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %esi, %ebp
+; CHECK-NEXT: movl %esi, %r14d
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: callq return_i1@PLT
; CHECK-NEXT: .Ltmp7:
-; CHECK-NEXT: testb $1, %bpl
+; CHECK-NEXT: testb $1, %r14b
; CHECK-NEXT: je .LBB7_2
; CHECK-NEXT: # %bb.1: # %left
-; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: movl %eax, %ebp
; CHECK-NEXT: movq %rbx, %rdi
; CHECK-NEXT: callq consume@PLT
-; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: movl %ebp, %eax
; CHECK-NEXT: jmp .LBB7_3
; CHECK-NEXT: .LBB7_2: # %right
; CHECK-NEXT: movb $1, %al
@@ -353,18 +353,18 @@ define void @test_limit(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(
; CHECK-NEXT: .cfi_offset %r12, -32
; CHECK-NEXT: .cfi_offset %r14, -24
; CHECK-NEXT: .cfi_offset %r15, -16
-; CHECK-NEXT: movq %r8, %r14
-; CHECK-NEXT: movq %rcx, %r15
-; CHECK-NEXT: movq %rdx, %r12
-; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %r8, %rbx
+; CHECK-NEXT: movq %rcx, %r14
+; CHECK-NEXT: movq %rdx, %r15
+; CHECK-NEXT: movq %rsi, %r12
; CHECK-NEXT: movq %rdi, (%rsp)
; CHECK-NEXT: callq func@PLT
; CHECK-NEXT: .Ltmp11:
; CHECK-NEXT: movq (%rsp), %rdi
-; CHECK-NEXT: movq %rbx, %rsi
-; CHECK-NEXT: movq %r12, %rdx
-; CHECK-NEXT: movq %r15, %rcx
-; CHECK-NEXT: movq %r14, %r8
+; CHECK-NEXT: movq %r12, %rsi
+; CHECK-NEXT: movq %r15, %rdx
+; CHECK-NEXT: movq %r14, %rcx
+; CHECK-NEXT: movq %rbx, %r8
; CHECK-NEXT: callq consume5@PLT
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 40
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.mir b/llvm/test/CodeGen/X86/statepoint-vreg.mir
index f02fda3e8f600..b7cbc1703f83b 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg.mir
+++ b/llvm/test/CodeGen/X86/statepoint-vreg.mir
@@ -20,12 +20,12 @@
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: .cfi_offset %r14, -16
- ; CHECK-NEXT: movq %rsi, %r14
- ; CHECK-NEXT: movq %rdi, %rbx
+ ; CHECK-NEXT: movq %rsi, %rbx
+ ; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: callq bar
; CHECK-NEXT: .Ltmp0:
- ; CHECK-NEXT: movl (%rbx), %eax
- ; CHECK-NEXT: addl (%r14), %eax
+ ; CHECK-NEXT: movl (%r14), %eax
+ ; CHECK-NEXT: addl (%rbx), %eax
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %rbx
@@ -83,25 +83,25 @@
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
- ; CHECK-NEXT: .short 14
+ ; CHECK-NEXT: .short 3
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
- ; CHECK-NEXT: .short 14
+ ; CHECK-NEXT: .short 3
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
- ; CHECK-NEXT: .short 3
+ ; CHECK-NEXT: .short 14
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
- ; CHECK-NEXT: .short 3
+ ; CHECK-NEXT: .short 14
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; CHECK-NEXT: .p2align 3
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 4dcdf56eae955..28a48af14d29f 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -41,18 +41,18 @@ define %S @negate(ptr nocapture readonly %this) {
; CHECK-LABEL: negate:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: xorl %r8d, %r8d
+; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: subq (%rsi), %rdx
; CHECK-NEXT: movl $0, %edi
; CHECK-NEXT: sbbq 8(%rsi), %rdi
-; CHECK-NEXT: movl $0, %ecx
-; CHECK-NEXT: sbbq 16(%rsi), %rcx
-; CHECK-NEXT: sbbq 24(%rsi), %r8
+; CHECK-NEXT: movl $0, %r8d
+; CHECK-NEXT: sbbq 16(%rsi), %r8
+; CHECK-NEXT: sbbq 24(%rsi), %rcx
; CHECK-NEXT: movq %rdx, (%rax)
; CHECK-NEXT: movq %rdi, 8(%rax)
-; CHECK-NEXT: movq %rcx, 16(%rax)
-; CHECK-NEXT: movq %r8, 24(%rax)
+; CHECK-NEXT: movq %r8, 16(%rax)
+; CHECK-NEXT: movq %rcx, 24(%rax)
; CHECK-NEXT: retq
entry:
%0 = load i64, ptr %this, align 8
@@ -93,25 +93,25 @@ define %S @sub(ptr nocapture readonly %this, %S %arg.b) {
; CHECK-LABEL: sub:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq (%rsi), %r10
-; CHECK-NEXT: movq 8(%rsi), %rdi
-; CHECK-NEXT: subq %rdx, %r10
+; CHECK-NEXT: movq (%rsi), %rdi
+; CHECK-NEXT: movq 8(%rsi), %r10
+; CHECK-NEXT: subq %rdx, %rdi
; CHECK-NEXT: setae %dl
; CHECK-NEXT: addb $-1, %dl
-; CHECK-NEXT: adcq $0, %rdi
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: movzbl %dl, %r11d
-; CHECK-NEXT: notq %rcx
-; CHECK-NEXT: addq %rdi, %rcx
-; CHECK-NEXT: adcq 16(%rsi), %r11
+; CHECK-NEXT: adcq $0, %r10
; CHECK-NEXT: setb %dl
; CHECK-NEXT: movzbl %dl, %edx
+; CHECK-NEXT: notq %rcx
+; CHECK-NEXT: addq %r10, %rcx
+; CHECK-NEXT: adcq 16(%rsi), %rdx
+; CHECK-NEXT: setb %r10b
+; CHECK-NEXT: movzbl %r10b, %r10d
; CHECK-NEXT: notq %r8
-; CHECK-NEXT: addq %r11, %r8
-; CHECK-NEXT: adcq 24(%rsi), %rdx
+; CHECK-NEXT: addq %rdx, %r8
+; CHECK-NEXT: adcq 24(%rsi), %r10
; CHECK-NEXT: notq %r9
-; CHECK-NEXT: addq %rdx, %r9
-; CHECK-NEXT: movq %r10, (%rax)
+; CHECK-NEXT: addq %r10, %r9
+; CHECK-NEXT: movq %rdi, (%rax)
; CHECK-NEXT: movq %rcx, 8(%rax)
; CHECK-NEXT: movq %r8, 16(%rax)
; CHECK-NEXT: movq %r9, 24(%rax)
@@ -593,21 +593,21 @@ define void @sub_U256_without_i128_or_recursive(ptr sret(%uint256) %0, ptr %1, p
; CHECK-LABEL: sub_U256_without_i128_or_recursive:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq (%rsi), %r8
-; CHECK-NEXT: movq 8(%rsi), %r9
-; CHECK-NEXT: movq 16(%rsi), %rcx
+; CHECK-NEXT: movq (%rsi), %rcx
+; CHECK-NEXT: movq 8(%rsi), %rdi
+; CHECK-NEXT: movq 16(%rsi), %r8
; CHECK-NEXT: movq 24(%rsi), %rsi
-; CHECK-NEXT: xorl %edi, %edi
-; CHECK-NEXT: subq 16(%rdx), %rcx
-; CHECK-NEXT: setb %dil
+; CHECK-NEXT: xorl %r9d, %r9d
+; CHECK-NEXT: subq 16(%rdx), %r8
+; CHECK-NEXT: setb %r9b
; CHECK-NEXT: subq 24(%rdx), %rsi
-; CHECK-NEXT: subq (%rdx), %r8
-; CHECK-NEXT: sbbq 8(%rdx), %r9
-; CHECK-NEXT: sbbq $0, %rcx
-; CHECK-NEXT: sbbq %rdi, %rsi
-; CHECK-NEXT: movq %r8, (%rax)
-; CHECK-NEXT: movq %r9, 8(%rax)
-; CHECK-NEXT: movq %rcx, 16(%rax)
+; CHECK-NEXT: subq (%rdx), %rcx
+; CHECK-NEXT: sbbq 8(%rdx), %rdi
+; CHECK-NEXT: sbbq $0, %r8
+; CHECK-NEXT: sbbq %r9, %rsi
+; CHECK-NEXT: movq %rcx, (%rax)
+; CHECK-NEXT: movq %rdi, 8(%rax)
+; CHECK-NEXT: movq %r8, 16(%rax)
; CHECK-NEXT: movq %rsi, 24(%rax)
; CHECK-NEXT: retq
%4 = load i64, ptr %1, align 8
diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll
index 88b81f12f4d67..41d2b1e1939cc 100644
--- a/llvm/test/CodeGen/X86/swifterror.ll
+++ b/llvm/test/CodeGen/X86/swifterror.ll
@@ -530,25 +530,25 @@ bb_end:
define void @foo_sret(ptr sret(%struct.S) %agg.result, i32 %val1, ptr swifterror %error_ptr_ref) {
; CHECK-APPLE-LABEL: foo_sret:
; CHECK-APPLE: ## %bb.0: ## %entry
-; CHECK-APPLE-NEXT: pushq %rbp
+; CHECK-APPLE-NEXT: pushq %r14
; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 16
; CHECK-APPLE-NEXT: pushq %rbx
; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 24
; CHECK-APPLE-NEXT: pushq %rax
; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 32
; CHECK-APPLE-NEXT: .cfi_offset %rbx, -24
-; CHECK-APPLE-NEXT: .cfi_offset %rbp, -16
-; CHECK-APPLE-NEXT: movl %esi, %ebp
-; CHECK-APPLE-NEXT: movq %rdi, %rbx
+; CHECK-APPLE-NEXT: .cfi_offset %r14, -16
+; CHECK-APPLE-NEXT: movl %esi, %ebx
+; CHECK-APPLE-NEXT: movq %rdi, %r14
; CHECK-APPLE-NEXT: movl $16, %edi
; CHECK-APPLE-NEXT: callq _malloc
; CHECK-APPLE-NEXT: movb $1, 8(%rax)
-; CHECK-APPLE-NEXT: movl %ebp, 4(%rbx)
+; CHECK-APPLE-NEXT: movl %ebx, 4(%r14)
; CHECK-APPLE-NEXT: movq %rax, %r12
-; CHECK-APPLE-NEXT: movq %rbx, %rax
+; CHECK-APPLE-NEXT: movq %r14, %rax
; CHECK-APPLE-NEXT: addq $8, %rsp
; CHECK-APPLE-NEXT: popq %rbx
-; CHECK-APPLE-NEXT: popq %rbp
+; CHECK-APPLE-NEXT: popq %r14
; CHECK-APPLE-NEXT: retq
;
; CHECK-O0-LABEL: foo_sret:
@@ -736,8 +736,8 @@ define float @caller_with_multiple_swifterror_values(ptr %error_ref, ptr %error_
; CHECK-APPLE-NEXT: .cfi_offset %rbx, -40
; CHECK-APPLE-NEXT: .cfi_offset %r12, -32
; CHECK-APPLE-NEXT: .cfi_offset %r14, -24
-; CHECK-APPLE-NEXT: movq %rsi, %r14
-; CHECK-APPLE-NEXT: movq %rdi, %rbx
+; CHECK-APPLE-NEXT: movq %rsi, %rbx
+; CHECK-APPLE-NEXT: movq %rdi, %r14
; CHECK-APPLE-NEXT: xorl %r12d, %r12d
; CHECK-APPLE-NEXT: callq _foo
; CHECK-APPLE-NEXT: movq %r12, %rdi
@@ -745,7 +745,7 @@ define float @caller_with_multiple_swifterror_values(ptr %error_ref, ptr %error_
; CHECK-APPLE-NEXT: jne LBB7_2
; CHECK-APPLE-NEXT: ## %bb.1: ## %cont
; CHECK-APPLE-NEXT: movzbl 8(%rdi), %eax
-; CHECK-APPLE-NEXT: movb %al, (%rbx)
+; CHECK-APPLE-NEXT: movb %al, (%r14)
; CHECK-APPLE-NEXT: LBB7_2: ## %handler
; CHECK-APPLE-NEXT: callq _free
; CHECK-APPLE-NEXT: movq %rsp, %rax
@@ -758,7 +758,7 @@ define float @caller_with_multiple_swifterror_values(ptr %error_ref, ptr %error_
; CHECK-APPLE-NEXT: jne LBB7_4
; CHECK-APPLE-NEXT: ## %bb.3: ## %cont2
; CHECK-APPLE-NEXT: movzbl 8(%rdi), %eax
-; CHECK-APPLE-NEXT: movb %al, (%r14)
+; CHECK-APPLE-NEXT: movb %al, (%rbx)
; CHECK-APPLE-NEXT: LBB7_4: ## %handler2
; CHECK-APPLE-NEXT: callq _free
; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1400,9 +1400,9 @@ define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, ptr swiftself,
; CHECK-APPLE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-APPLE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-APPLE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-APPLE-NEXT: movq %rcx, %r14
-; CHECK-APPLE-NEXT: movq %rdx, %r15
-; CHECK-APPLE-NEXT: movq %rsi, %rbx
+; CHECK-APPLE-NEXT: movq %rcx, %rbx
+; CHECK-APPLE-NEXT: movq %rdx, %r14
+; CHECK-APPLE-NEXT: movq %rsi, %r15
; CHECK-APPLE-NEXT: movq %rdi, %rbp
; CHECK-APPLE-NEXT: movl $1, %edi
; CHECK-APPLE-NEXT: movl $2, %esi
@@ -1414,9 +1414,9 @@ define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, ptr swiftself,
; CHECK-APPLE-NEXT: xorl %r12d, %r12d
; CHECK-APPLE-NEXT: callq _params_in_reg2
; CHECK-APPLE-NEXT: movq %rbp, %rdi
-; CHECK-APPLE-NEXT: movq %rbx, %rsi
-; CHECK-APPLE-NEXT: movq %r15, %rdx
-; CHECK-APPLE-NEXT: movq %r14, %rcx
+; CHECK-APPLE-NEXT: movq %r15, %rsi
+; CHECK-APPLE-NEXT: movq %r14, %rdx
+; CHECK-APPLE-NEXT: movq %rbx, %rcx
; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 ## 8-byte Reload
@@ -1566,13 +1566,13 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
; CHECK-APPLE-NEXT: .cfi_offset %r14, -32
; CHECK-APPLE-NEXT: .cfi_offset %r15, -24
; CHECK-APPLE-NEXT: .cfi_offset %rbp, -16
-; CHECK-APPLE-NEXT: movq %r12, %r14
+; CHECK-APPLE-NEXT: movq %r12, %rbx
; CHECK-APPLE-NEXT: movq %r13, (%rsp) ## 8-byte Spill
; CHECK-APPLE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-APPLE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-APPLE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-APPLE-NEXT: movq %rdx, %r15
-; CHECK-APPLE-NEXT: movq %rsi, %rbx
+; CHECK-APPLE-NEXT: movq %rdx, %r14
+; CHECK-APPLE-NEXT: movq %rsi, %r15
; CHECK-APPLE-NEXT: movq %rdi, %rbp
; CHECK-APPLE-NEXT: movl $1, %edi
; CHECK-APPLE-NEXT: movl $2, %esi
@@ -1585,18 +1585,18 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
; CHECK-APPLE-NEXT: callq _params_in_reg2
; CHECK-APPLE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-APPLE-NEXT: movq %rbp, %rdi
-; CHECK-APPLE-NEXT: movq %rbx, %rsi
-; CHECK-APPLE-NEXT: movq %r15, %rdx
+; CHECK-APPLE-NEXT: movq %r15, %rsi
+; CHECK-APPLE-NEXT: movq %r14, %rdx
; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
; CHECK-APPLE-NEXT: movq (%rsp), %r13 ## 8-byte Reload
-; CHECK-APPLE-NEXT: movq %r14, %r12
+; CHECK-APPLE-NEXT: movq %rbx, %r12
; CHECK-APPLE-NEXT: callq _params_and_return_in_reg2
-; CHECK-APPLE-NEXT: movq %rax, %rbx
-; CHECK-APPLE-NEXT: movq %rdx, %rbp
-; CHECK-APPLE-NEXT: movq %rcx, %r15
-; CHECK-APPLE-NEXT: movq %r8, %r14
+; CHECK-APPLE-NEXT: movq %rax, %r14
+; CHECK-APPLE-NEXT: movq %rdx, %r15
+; CHECK-APPLE-NEXT: movq %rcx, %rbp
+; CHECK-APPLE-NEXT: movq %r8, %rbx
; CHECK-APPLE-NEXT: movq %r12, (%rsp) ## 8-byte Spill
; CHECK-APPLE-NEXT: movl $1, %edi
; CHECK-APPLE-NEXT: movl $2, %esi
@@ -1607,10 +1607,10 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
; CHECK-APPLE-NEXT: xorl %r13d, %r13d
; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload
; CHECK-APPLE-NEXT: callq _params_in_reg2
-; CHECK-APPLE-NEXT: movq %rbx, %rax
-; CHECK-APPLE-NEXT: movq %rbp, %rdx
-; CHECK-APPLE-NEXT: movq %r15, %rcx
-; CHECK-APPLE-NEXT: movq %r14, %r8
+; CHECK-APPLE-NEXT: movq %r14, %rax
+; CHECK-APPLE-NEXT: movq %r15, %rdx
+; CHECK-APPLE-NEXT: movq %rbp, %rcx
+; CHECK-APPLE-NEXT: movq %rbx, %r8
; CHECK-APPLE-NEXT: movq (%rsp), %r12 ## 8-byte Reload
; CHECK-APPLE-NEXT: addq $48, %rsp
; CHECK-APPLE-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
index bdcc7ee61fecf..88142294727d7 100644
--- a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
+++ b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
@@ -100,32 +100,32 @@ define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i3
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_27
; CHECK-NEXT: # %bb.1: # %if.end19
-; CHECK-NEXT: movl %esi, %r13d
-; CHECK-NEXT: movq %rdi, %r12
-; CHECK-NEXT: movl (%rax), %ebp
-; CHECK-NEXT: leal (,%rbp,4), %r14d
-; CHECK-NEXT: movl %r14d, %r15d
+; CHECK-NEXT: movl %esi, %ebp
+; CHECK-NEXT: movq %rdi, %r15
+; CHECK-NEXT: movl (%rax), %r13d
+; CHECK-NEXT: leal (,%r13,4), %ebx
+; CHECK-NEXT: movl %ebx, %r12d
; CHECK-NEXT: movl $1, %esi
-; CHECK-NEXT: movq %r15, %rdi
+; CHECK-NEXT: movq %r12, %rdi
; CHECK-NEXT: callq cli_calloc at PLT
-; CHECK-NEXT: testl %r13d, %r13d
+; CHECK-NEXT: testl %ebp, %ebp
; CHECK-NEXT: je .LBB1_26
; CHECK-NEXT: # %bb.2: # %if.end19
-; CHECK-NEXT: testl %ebp, %ebp
+; CHECK-NEXT: testl %r13d, %r13d
; CHECK-NEXT: je .LBB1_26
; CHECK-NEXT: # %bb.3: # %if.end19
-; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: movq %rax, %r14
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_26
; CHECK-NEXT: # %bb.4: # %if.end19
-; CHECK-NEXT: cmpq %r12, %rbx
+; CHECK-NEXT: cmpq %r15, %r14
; CHECK-NEXT: jb .LBB1_26
; CHECK-NEXT: # %bb.5: # %if.end50
-; CHECK-NEXT: movq %rbx, %rdi
-; CHECK-NEXT: movq %r15, %rdx
+; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: movq %r12, %rdx
; CHECK-NEXT: callq memcpy at PLT
-; CHECK-NEXT: cmpl $4, %r14d
+; CHECK-NEXT: cmpl $4, %ebx
; CHECK-NEXT: jb .LBB1_29
; CHECK-NEXT: # %bb.6: # %shared_preheader
; CHECK-NEXT: movb $32, %dl
@@ -146,13 +146,13 @@ define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i3
; CHECK-NEXT: .LBB1_9: # %outer_loop_header
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB1_10 Depth 2
-; CHECK-NEXT: testl %ebp, %ebp
+; CHECK-NEXT: testl %r13d, %r13d
; CHECK-NEXT: je .LBB1_19
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_10: # %shared_loop_header
; CHECK-NEXT: # Parent Loop BB1_9 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: testq %rbx, %rbx
+; CHECK-NEXT: testq %r14, %r14
; CHECK-NEXT: jne .LBB1_28
; CHECK-NEXT: # %bb.11: # %inner_loop_body
; CHECK-NEXT: # in Loop: Header=BB1_10 Depth=2
@@ -160,12 +160,12 @@ define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i3
; CHECK-NEXT: jns .LBB1_10
; CHECK-NEXT: # %bb.12: # %if.end96.i
; CHECK-NEXT: # in Loop: Header=BB1_9 Depth=1
-; CHECK-NEXT: cmpl $3, %ebp
+; CHECK-NEXT: cmpl $3, %r13d
; CHECK-NEXT: jae .LBB1_23
; CHECK-NEXT: # %bb.13: # %if.end287.i
; CHECK-NEXT: # in Loop: Header=BB1_9 Depth=1
; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: cmpl $1, %ebp
+; CHECK-NEXT: cmpl $1, %r13d
; CHECK-NEXT: setne %dl
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_17
diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll
index 1548e2df42bed..e21d98238aa35 100644
--- a/llvm/test/CodeGen/X86/tail-opts.ll
+++ b/llvm/test/CodeGen/X86/tail-opts.ll
@@ -95,22 +95,22 @@ declare ptr @choose(ptr, ptr)
define dso_local void @tail_duplicate_me() nounwind {
; CHECK-LABEL: tail_duplicate_me:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: callq qux at PLT
; CHECK-NEXT: movl $.Ltmp0, %edi
; CHECK-NEXT: movl $.Ltmp1, %esi
-; CHECK-NEXT: movl %eax, %ebx
+; CHECK-NEXT: movl %eax, %ebp
; CHECK-NEXT: callq choose at PLT
-; CHECK-NEXT: movq %rax, %r14
-; CHECK-NEXT: testb $1, %bl
+; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: testb $1, %bpl
; CHECK-NEXT: je .LBB1_1
; CHECK-NEXT: # %bb.7: # %A
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: callq bar
; CHECK-NEXT: movl $0, GHJK(%rip)
-; CHECK-NEXT: jmpq *%r14
+; CHECK-NEXT: jmpq *%rbx
; CHECK-NEXT: .Ltmp0: # Block address taken
; CHECK-NEXT: .LBB1_4: # %return
; CHECK-NEXT: movl $1000, %edi # imm = 0x3E8
@@ -124,7 +124,7 @@ define dso_local void @tail_duplicate_me() nounwind {
; CHECK-NEXT: movl $1, %edi
; CHECK-NEXT: callq car
; CHECK-NEXT: movl $0, GHJK(%rip)
-; CHECK-NEXT: jmpq *%r14
+; CHECK-NEXT: jmpq *%rbx
; CHECK-NEXT: .Ltmp1: # Block address taken
; CHECK-NEXT: .LBB1_6: # %altret
; CHECK-NEXT: movl $1001, %edi # imm = 0x3E9
@@ -132,13 +132,13 @@ define dso_local void @tail_duplicate_me() nounwind {
; CHECK-NEXT: .LBB1_5: # %return
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB1_3: # %C
; CHECK-NEXT: movl $2, %edi
; CHECK-NEXT: callq dar
; CHECK-NEXT: movl $0, GHJK(%rip)
-; CHECK-NEXT: jmpq *%r14
+; CHECK-NEXT: jmpq *%rbx
entry:
%a = call i1 @qux()
%c = call ptr @choose(ptr blockaddress(@tail_duplicate_me, %return),
diff --git a/llvm/test/CodeGen/X86/tailcallstack64.ll b/llvm/test/CodeGen/X86/tailcallstack64.ll
index 158b777fe1fbc..77295ee0ff871 100644
--- a/llvm/test/CodeGen/X86/tailcallstack64.ll
+++ b/llvm/test/CodeGen/X86/tailcallstack64.ll
@@ -6,7 +6,7 @@
; Check that lowered arguments on the stack do not overwrite each other.
; Add %in1 %p1 to a different temporary register (%eax).
-; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..]]
+; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..|%r.*d]]
; Move param %in1 to temp register (%r10d).
; CHECK: movl [[A2:40|152]](%rsp), [[R2:%[a-z0-9]+]]
; Add %in1 %p1 to a different temporary register (%eax).
diff --git a/llvm/test/CodeGen/X86/tailccstack64.ll b/llvm/test/CodeGen/X86/tailccstack64.ll
index bd0f4a739504f..477097eac188c 100644
--- a/llvm/test/CodeGen/X86/tailccstack64.ll
+++ b/llvm/test/CodeGen/X86/tailccstack64.ll
@@ -6,7 +6,7 @@
; Check that lowered arguments on the stack do not overwrite each other.
; Add %in1 %p1 to a different temporary register (%eax).
-; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..]]
+; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..|%r.*d]]
; Move param %in1 to temp register (%r10d).
; CHECK: movl [[A2:40|152]](%rsp), [[R2:%[a-z0-9]+]]
; Add %in1 %p1 to a different temporary register (%eax).
diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll
index 2b1f38094674f..14186537daea5 100644
--- a/llvm/test/CodeGen/X86/twoaddr-lea.ll
+++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll
@@ -63,7 +63,7 @@ entry:
define void @ham() {
; CHECK-LABEL: ham:
; CHECK: ## %bb.0: ## %bb
-; CHECK-NEXT: xorl %r8d, %r8d
+; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: movq _global at GOTPCREL(%rip), %rdx
; CHECK-NEXT: movq _global2 at GOTPCREL(%rip), %rsi
; CHECK-NEXT: xorl %eax, %eax
@@ -74,16 +74,16 @@ define void @ham() {
; CHECK-NEXT: ## =>This Loop Header: Depth=1
; CHECK-NEXT: ## Child Loop BB3_7 Depth 2
; CHECK-NEXT: movl (%rdx), %edi
-; CHECK-NEXT: leal (%rdi,%rax), %ecx
-; CHECK-NEXT: movslq %ecx, %rcx
+; CHECK-NEXT: leal (%rdi,%rax), %r8d
+; CHECK-NEXT: movslq %r8d, %r8
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB3_7: ## %bb6
; CHECK-NEXT: ## Parent Loop BB3_6 Depth=1
; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
; CHECK-NEXT: movq %rax, (%rsi)
-; CHECK-NEXT: movq %rcx, (%rsi)
+; CHECK-NEXT: movq %r8, (%rsi)
; CHECK-NEXT: movl %edi, (%rdx)
-; CHECK-NEXT: testb %r8b, %r8b
+; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne LBB3_7
; CHECK-NEXT: ## %bb.8: ## %bb9
; CHECK-NEXT: ## in Loop: Header=BB3_6 Depth=1
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index 1286f2da6405a..234259de2ad62 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -1031,25 +1031,25 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE-NEXT: pcmpgtd %xmm4, %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE-NEXT: pand %xmm11, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pand %xmm11, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
; SSE-NEXT: por %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm9
-; SSE-NEXT: pxor %xmm8, %xmm9
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pxor %xmm8, %xmm4
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: pxor %xmm8, %xmm5
-; SSE-NEXT: movdqa %xmm9, %xmm4
-; SSE-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: pand %xmm10, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: por %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE-NEXT: pand %xmm10, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: por %xmm4, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pxor %xmm8, %xmm4
; SSE-NEXT: paddq %xmm6, %xmm2
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index a3be2999354c6..47635265223b4 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -310,16 +310,16 @@ define i16 @func7(i16 %x, i16 %y) nounwind {
define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-LABEL: vec:
; X64: # %bb.0:
-; X64-NEXT: pxor %xmm9, %xmm9
+; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; X64-NEXT: movq %xmm3, %rax
; X64-NEXT: movdqa %xmm1, %xmm4
-; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; X64-NEXT: movq %xmm4, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
-; X64-NEXT: movq %rax, %xmm6
+; X64-NEXT: movq %rax, %xmm8
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X64-NEXT: movq %xmm3, %rax
; X64-NEXT: movdqa %xmm1, %xmm3
@@ -328,50 +328,50 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm3
-; X64-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
-; X64-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
-; X64-NEXT: movdqa %xmm6, %xmm3
-; X64-NEXT: pxor %xmm10, %xmm3
+; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0]
+; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; X64-NEXT: movdqa %xmm8, %xmm3
+; X64-NEXT: pxor %xmm4, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; X64-NEXT: movdqa {{.*#+}} xmm8 = [2147483649,2147483649,2147483649,2147483649]
-; X64-NEXT: pcmpeqd %xmm8, %xmm7
-; X64-NEXT: movdqa {{.*#+}} xmm2 = [9223372043297226751,9223372043297226751]
-; X64-NEXT: movdqa %xmm2, %xmm5
-; X64-NEXT: pcmpgtd %xmm3, %xmm5
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; X64-NEXT: pand %xmm7, %xmm4
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; X64-NEXT: por %xmm4, %xmm3
+; X64-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649,2147483649,2147483649]
+; X64-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-NEXT: movdqa {{.*#+}} xmm5 = [9223372043297226751,9223372043297226751]
+; X64-NEXT: movdqa %xmm5, %xmm9
+; X64-NEXT: pcmpgtd %xmm3, %xmm9
+; X64-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; X64-NEXT: pand %xmm7, %xmm10
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
+; X64-NEXT: por %xmm10, %xmm3
; X64-NEXT: movdqa {{.*#+}} xmm7 = [8589934591,8589934591]
-; X64-NEXT: pand %xmm3, %xmm6
+; X64-NEXT: pand %xmm3, %xmm8
; X64-NEXT: pandn %xmm7, %xmm3
-; X64-NEXT: por %xmm6, %xmm3
+; X64-NEXT: por %xmm8, %xmm3
; X64-NEXT: psrlq $1, %xmm3
-; X64-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
-; X64-NEXT: movq %xmm9, %rax
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: movq %xmm2, %rax
; X64-NEXT: movd %xmm1, %ecx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
-; X64-NEXT: movq %rax, %xmm4
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
+; X64-NEXT: movq %rax, %xmm8
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: psrlq $32, %xmm1
; X64-NEXT: movq %xmm1, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm0
-; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; X64-NEXT: pxor %xmm4, %xmm10
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; X64-NEXT: pcmpeqd %xmm8, %xmm0
-; X64-NEXT: pcmpgtd %xmm10, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0]
+; X64-NEXT: pxor %xmm8, %xmm4
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; X64-NEXT: pcmpeqd %xmm6, %xmm0
+; X64-NEXT: pcmpgtd %xmm4, %xmm5
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
; X64-NEXT: pand %xmm0, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
; X64-NEXT: por %xmm1, %xmm0
-; X64-NEXT: pand %xmm0, %xmm4
+; X64-NEXT: pand %xmm0, %xmm8
; X64-NEXT: pandn %xmm7, %xmm0
-; X64-NEXT: por %xmm4, %xmm0
+; X64-NEXT: por %xmm8, %xmm0
; X64-NEXT: psrlq $1, %xmm0
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index a17551deb7656..b516d69c676df 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -37,7 +37,7 @@ define i32 @test2(i32 %a, i32 %b) nounwind readnone {
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: leal (%rdi,%rsi), %eax
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: addl %eax, %eax
; X64-NEXT: retq
entry:
%tmp0 = add i32 %b, %a
@@ -530,93 +530,91 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X64-NEXT: pushq %rbx
; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r8, %r11
-; X64-NEXT: movq %rcx, %r10
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r14, %rdi
-; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %r8, %rbp
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %rbx, %r15
+; X64-NEXT: adcq $0, %r14
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rdi, %r9
-; X64-NEXT: adcq %rbx, %rcx
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r15, %rbx
+; X64-NEXT: adcq %r14, %r12
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %edi
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq %rdi, %rbx
+; X64-NEXT: movzbl %al, %r8d
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r12, %r13
+; X64-NEXT: adcq %r8, %r15
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r14, %rdi
-; X64-NEXT: adcq %r8, %rdx
-; X64-NEXT: imulq %rcx, %r11
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; X64-NEXT: addq %rbp, %r15
-; X64-NEXT: adcq %rbx, %rdi
-; X64-NEXT: adcq %rdx, %r11
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %r12, %r8
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: addq %r8, %r12
; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: imulq %rcx, %r13
-; X64-NEXT: addq %rdx, %r13
-; X64-NEXT: addq %r15, %r8
-; X64-NEXT: adcq %rdi, %rax
-; X64-NEXT: adcq %r11, %r13
-; X64-NEXT: imulq %r14, %r10
-; X64-NEXT: addq %r13, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: imulq %r9, %r11
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: addq %r13, %r14
+; X64-NEXT: adcq %r15, %r12
+; X64-NEXT: adcq %rdx, %r11
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %r8, %rbp
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rbp, %rax
+; X64-NEXT: adcq %r13, %rdx
+; X64-NEXT: imulq %r8, %r10
+; X64-NEXT: addq %rdx, %r10
+; X64-NEXT: addq %r14, %r15
+; X64-NEXT: adcq %r12, %rax
+; X64-NEXT: adcq %r11, %r10
+; X64-NEXT: imulq %r9, %rcx
+; X64-NEXT: addq %r10, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx
; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: addq %rdx, %rsi
; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: addq %r10, %rsi
-; X64-NEXT: movq %r9, 8(%r12)
+; X64-NEXT: movq %rbx, 8(%rdi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, (%r12)
-; X64-NEXT: movq %r8, 16(%r12)
-; X64-NEXT: movq %rax, 24(%r12)
-; X64-NEXT: movl %esi, 32(%r12)
+; X64-NEXT: movq %rcx, (%rdi)
+; X64-NEXT: movq %r15, 16(%rdi)
+; X64-NEXT: movq %rax, 24(%rdi)
+; X64-NEXT: movl %esi, 32(%rdi)
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: andl $4095, %esi # imm = 0xFFF
-; X64-NEXT: movw %si, 36(%r12)
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movw %si, 36(%rdi)
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index c3849d7de86aa..6f639880dc574 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -312,41 +312,41 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT: xorb %bl, %sil
+; CHECK-BASELINE-NEXT: xorb %r12b, %sil
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT: xorb %bl, %sil
-; CHECK-BASELINE-NEXT: xorb %r12b, %dl
+; CHECK-BASELINE-NEXT: xorb %r12b, %sil
+; CHECK-BASELINE-NEXT: xorb %r15b, %dl
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT: xorb %r12b, %dl
-; CHECK-BASELINE-NEXT: xorb %r15b, %cl
+; CHECK-BASELINE-NEXT: xorb %r15b, %dl
+; CHECK-BASELINE-NEXT: xorb %r14b, %cl
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT: xorb %r15b, %cl
-; CHECK-BASELINE-NEXT: xorb %r14b, %r8b
+; CHECK-BASELINE-NEXT: xorb %r14b, %cl
+; CHECK-BASELINE-NEXT: xorb %bpl, %r8b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b
-; CHECK-BASELINE-NEXT: xorb %r14b, %r8b
-; CHECK-BASELINE-NEXT: xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT: xorb %bpl, %r8b
+; CHECK-BASELINE-NEXT: xorb %bl, %r9b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b
-; CHECK-BASELINE-NEXT: xorb %bpl, %r9b
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT: xorb %r11b, %bpl
-; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT: xorb %r11b, %bpl
+; CHECK-BASELINE-NEXT: xorb %bl, %r9b
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: xorb %r11b, %bl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT: xorb %r11b, %bl
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT: xorb %r10b, %r11b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b
; CHECK-BASELINE-NEXT: xorb %r10b, %r11b
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT: xorb %dil, %bl
-; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT: xorb %dil, %bl
-; CHECK-BASELINE-NEXT: movb %bl, 7(%rax)
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: xorb %dil, %r10b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT: xorb %dil, %r10b
+; CHECK-BASELINE-NEXT: movb %r10b, 7(%rax)
; CHECK-BASELINE-NEXT: movb %r11b, 6(%rax)
-; CHECK-BASELINE-NEXT: movb %bpl, 5(%rax)
+; CHECK-BASELINE-NEXT: movb %bl, 5(%rax)
; CHECK-BASELINE-NEXT: movb %r9b, 4(%rax)
; CHECK-BASELINE-NEXT: movb %r8b, 3(%rax)
; CHECK-BASELINE-NEXT: movb %cl, 2(%rax)
@@ -370,41 +370,41 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT: xorb %bl, %sil
+; CHECK-SSE1-NEXT: xorb %r12b, %sil
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT: xorb %bl, %sil
-; CHECK-SSE1-NEXT: xorb %r12b, %dl
+; CHECK-SSE1-NEXT: xorb %r12b, %sil
+; CHECK-SSE1-NEXT: xorb %r15b, %dl
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT: xorb %r12b, %dl
-; CHECK-SSE1-NEXT: xorb %r15b, %cl
+; CHECK-SSE1-NEXT: xorb %r15b, %dl
+; CHECK-SSE1-NEXT: xorb %r14b, %cl
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT: xorb %r15b, %cl
-; CHECK-SSE1-NEXT: xorb %r14b, %r8b
+; CHECK-SSE1-NEXT: xorb %r14b, %cl
+; CHECK-SSE1-NEXT: xorb %bpl, %r8b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b
-; CHECK-SSE1-NEXT: xorb %r14b, %r8b
-; CHECK-SSE1-NEXT: xorb %bpl, %r9b
+; CHECK-SSE1-NEXT: xorb %bpl, %r8b
+; CHECK-SSE1-NEXT: xorb %bl, %r9b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b
-; CHECK-SSE1-NEXT: xorb %bpl, %r9b
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT: xorb %r11b, %bpl
-; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT: xorb %r11b, %bpl
+; CHECK-SSE1-NEXT: xorb %bl, %r9b
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: xorb %r11b, %bl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT: xorb %r11b, %bl
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT: xorb %r10b, %r11b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b
; CHECK-SSE1-NEXT: xorb %r10b, %r11b
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT: xorb %dil, %bl
-; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT: xorb %dil, %bl
-; CHECK-SSE1-NEXT: movb %bl, 7(%rax)
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: xorb %dil, %r10b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT: xorb %dil, %r10b
+; CHECK-SSE1-NEXT: movb %r10b, 7(%rax)
; CHECK-SSE1-NEXT: movb %r11b, 6(%rax)
-; CHECK-SSE1-NEXT: movb %bpl, 5(%rax)
+; CHECK-SSE1-NEXT: movb %bl, 5(%rax)
; CHECK-SSE1-NEXT: movb %r9b, 4(%rax)
; CHECK-SSE1-NEXT: movb %r8b, 3(%rax)
; CHECK-SSE1-NEXT: movb %cl, 2(%rax)
@@ -439,18 +439,18 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
; CHECK-BASELINE-LABEL: out_v4i16:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT: xorl %edi, %edx
+; CHECK-BASELINE-NEXT: xorl %r11d, %edx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx
-; CHECK-BASELINE-NEXT: xorl %edi, %edx
-; CHECK-BASELINE-NEXT: xorl %r11d, %ecx
+; CHECK-BASELINE-NEXT: xorl %r11d, %edx
+; CHECK-BASELINE-NEXT: xorl %r10d, %ecx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT: xorl %r11d, %ecx
-; CHECK-BASELINE-NEXT: xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT: xorl %r10d, %ecx
+; CHECK-BASELINE-NEXT: xorl %edi, %r8d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w
-; CHECK-BASELINE-NEXT: xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT: xorl %edi, %r8d
; CHECK-BASELINE-NEXT: xorl %r9d, %esi
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT: xorl %r9d, %esi
@@ -463,18 +463,18 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
; CHECK-SSE1-LABEL: out_v4i16:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT: xorl %edi, %edx
+; CHECK-SSE1-NEXT: xorl %r11d, %edx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx
-; CHECK-SSE1-NEXT: xorl %edi, %edx
-; CHECK-SSE1-NEXT: xorl %r11d, %ecx
+; CHECK-SSE1-NEXT: xorl %r11d, %edx
+; CHECK-SSE1-NEXT: xorl %r10d, %ecx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT: xorl %r11d, %ecx
-; CHECK-SSE1-NEXT: xorl %r10d, %r8d
+; CHECK-SSE1-NEXT: xorl %r10d, %ecx
+; CHECK-SSE1-NEXT: xorl %edi, %r8d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w
-; CHECK-SSE1-NEXT: xorl %r10d, %r8d
+; CHECK-SSE1-NEXT: xorl %edi, %r8d
; CHECK-SSE1-NEXT: xorl %r9d, %esi
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT: xorl %r9d, %esi
@@ -506,15 +506,15 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
; CHECK-BASELINE-LABEL: out_v4i16_undef:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT: xorl %edi, %edx
+; CHECK-BASELINE-NEXT: xorl %r10d, %edx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx
-; CHECK-BASELINE-NEXT: xorl %edi, %edx
-; CHECK-BASELINE-NEXT: xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT: xorl %r10d, %edx
+; CHECK-BASELINE-NEXT: xorl %edi, %r8d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w
-; CHECK-BASELINE-NEXT: xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT: xorl %edi, %r8d
; CHECK-BASELINE-NEXT: xorl %r9d, %esi
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT: xorl %r9d, %esi
@@ -527,15 +527,15 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
; CHECK-SSE1-LABEL: out_v4i16_undef:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT: xorl %edi, %edx
+; CHECK-SSE1-NEXT: xorl %r10d, %edx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx
-; CHECK-SSE1-NEXT: xorl %edi, %edx
-; CHECK-SSE1-NEXT: xorl %r10d, %r8d
+; CHECK-SSE1-NEXT: xorl %r10d, %edx
+; CHECK-SSE1-NEXT: xorl %edi, %r8d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w
-; CHECK-SSE1-NEXT: xorl %r10d, %r8d
+; CHECK-SSE1-NEXT: xorl %edi, %r8d
; CHECK-SSE1-NEXT: xorl %r9d, %esi
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT: xorl %r9d, %esi
@@ -637,16 +637,16 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-BASELINE-NEXT: movl %edx, %r11d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT: xorb %bl, %sil
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: xorb %r10b, %sil
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT: xorb %bl, %sil
+; CHECK-BASELINE-NEXT: xorb %r10b, %sil
; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: xorb %dl, %r11b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b
@@ -655,21 +655,21 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: xorb %r10b, %r8b
+; CHECK-BASELINE-NEXT: xorb %bl, %r8b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b
-; CHECK-BASELINE-NEXT: xorb %r10b, %r8b
+; CHECK-BASELINE-NEXT: xorb %bl, %r8b
; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: xorb %r12b, %r9b
+; CHECK-BASELINE-NEXT: xorb %r14b, %r9b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b
-; CHECK-BASELINE-NEXT: xorb %r12b, %r9b
+; CHECK-BASELINE-NEXT: xorb %r14b, %r9b
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT: xorb %r12b, %r14b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT: xorb %r12b, %r14b
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT: xorb %bpl, %r12b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b
; CHECK-BASELINE-NEXT: xorb %bpl, %r12b
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT: xorb %r14b, %bpl
-; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT: xorb %r14b, %bpl
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-BASELINE-NEXT: xorb %r15b, %sil
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil
@@ -693,11 +693,11 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-BASELINE-NEXT: xorb %al, %r15b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT: xorb %al, %r15b
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT: xorb %al, %r14b
-; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-BASELINE-NEXT: xorb %al, %r14b
+; CHECK-BASELINE-NEXT: xorb %al, %bpl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT: xorb %al, %bpl
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT: xorb %al, %bl
@@ -716,14 +716,14 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-BASELINE-NEXT: movb %r10b, 15(%rdi)
; CHECK-BASELINE-NEXT: movb %al, 14(%rdi)
; CHECK-BASELINE-NEXT: movb %bl, 13(%rdi)
-; CHECK-BASELINE-NEXT: movb %r14b, 12(%rdi)
+; CHECK-BASELINE-NEXT: movb %bpl, 12(%rdi)
; CHECK-BASELINE-NEXT: movb %r15b, 11(%rdi)
; CHECK-BASELINE-NEXT: movb %r13b, 10(%rdi)
; CHECK-BASELINE-NEXT: movb %cl, 9(%rdi)
; CHECK-BASELINE-NEXT: movb %dl, 8(%rdi)
; CHECK-BASELINE-NEXT: movb %sil, 7(%rdi)
-; CHECK-BASELINE-NEXT: movb %bpl, 6(%rdi)
-; CHECK-BASELINE-NEXT: movb %r12b, 5(%rdi)
+; CHECK-BASELINE-NEXT: movb %r12b, 6(%rdi)
+; CHECK-BASELINE-NEXT: movb %r14b, 5(%rdi)
; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT: movb %al, 3(%rdi)
@@ -752,16 +752,16 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-SSE1-NEXT: movl %edx, %r11d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT: xorb %bl, %sil
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: xorb %r10b, %sil
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT: xorb %bl, %sil
+; CHECK-SSE1-NEXT: xorb %r10b, %sil
; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: xorb %dl, %r11b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b
@@ -770,21 +770,21 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: xorb %r10b, %r8b
+; CHECK-SSE1-NEXT: xorb %bl, %r8b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b
-; CHECK-SSE1-NEXT: xorb %r10b, %r8b
+; CHECK-SSE1-NEXT: xorb %bl, %r8b
; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: xorb %r12b, %r9b
+; CHECK-SSE1-NEXT: xorb %r14b, %r9b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b
-; CHECK-SSE1-NEXT: xorb %r12b, %r9b
+; CHECK-SSE1-NEXT: xorb %r14b, %r9b
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT: xorb %r12b, %r14b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT: xorb %r12b, %r14b
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT: xorb %bpl, %r12b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b
; CHECK-SSE1-NEXT: xorb %bpl, %r12b
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT: xorb %r14b, %bpl
-; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT: xorb %r14b, %bpl
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-SSE1-NEXT: xorb %r15b, %sil
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil
@@ -808,11 +808,11 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-SSE1-NEXT: xorb %al, %r15b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT: xorb %al, %r15b
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT: xorb %al, %r14b
-; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-SSE1-NEXT: xorb %al, %r14b
+; CHECK-SSE1-NEXT: xorb %al, %bpl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT: xorb %al, %bpl
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT: xorb %al, %bl
@@ -831,14 +831,14 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-SSE1-NEXT: movb %r10b, 15(%rdi)
; CHECK-SSE1-NEXT: movb %al, 14(%rdi)
; CHECK-SSE1-NEXT: movb %bl, 13(%rdi)
-; CHECK-SSE1-NEXT: movb %r14b, 12(%rdi)
+; CHECK-SSE1-NEXT: movb %bpl, 12(%rdi)
; CHECK-SSE1-NEXT: movb %r15b, 11(%rdi)
; CHECK-SSE1-NEXT: movb %r13b, 10(%rdi)
; CHECK-SSE1-NEXT: movb %cl, 9(%rdi)
; CHECK-SSE1-NEXT: movb %dl, 8(%rdi)
; CHECK-SSE1-NEXT: movb %sil, 7(%rdi)
-; CHECK-SSE1-NEXT: movb %bpl, 6(%rdi)
-; CHECK-SSE1-NEXT: movb %r12b, 5(%rdi)
+; CHECK-SSE1-NEXT: movb %r12b, 6(%rdi)
+; CHECK-SSE1-NEXT: movb %r14b, 5(%rdi)
; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT: movb %al, 3(%rdi)
@@ -883,44 +883,44 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT: xorl %ebp, %esi
+; CHECK-BASELINE-NEXT: xorl %r12d, %esi
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT: xorl %ebp, %esi
-; CHECK-BASELINE-NEXT: xorl %ebx, %edx
+; CHECK-BASELINE-NEXT: xorl %r12d, %esi
+; CHECK-BASELINE-NEXT: xorl %r15d, %edx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx
-; CHECK-BASELINE-NEXT: xorl %ebx, %edx
-; CHECK-BASELINE-NEXT: xorl %edi, %ecx
+; CHECK-BASELINE-NEXT: xorl %r15d, %edx
+; CHECK-BASELINE-NEXT: xorl %r14d, %ecx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT: xorl %edi, %ecx
-; CHECK-BASELINE-NEXT: xorl %r12d, %r8d
+; CHECK-BASELINE-NEXT: xorl %r14d, %ecx
+; CHECK-BASELINE-NEXT: xorl %ebp, %r8d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w
-; CHECK-BASELINE-NEXT: xorl %r12d, %r8d
-; CHECK-BASELINE-NEXT: xorl %r15d, %r9d
+; CHECK-BASELINE-NEXT: xorl %ebp, %r8d
+; CHECK-BASELINE-NEXT: xorl %ebx, %r9d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w
-; CHECK-BASELINE-NEXT: xorl %r15d, %r9d
-; CHECK-BASELINE-NEXT: movl %r14d, %edi
-; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %di
-; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di
-; CHECK-BASELINE-NEXT: xorl %r14d, %edi
+; CHECK-BASELINE-NEXT: xorl %ebx, %r9d
; CHECK-BASELINE-NEXT: movl %r11d, %ebx
; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx
; CHECK-BASELINE-NEXT: xorl %r11d, %ebx
-; CHECK-BASELINE-NEXT: movl %r10d, %ebp
-; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bp
-; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bp
-; CHECK-BASELINE-NEXT: xorl %r10d, %ebp
-; CHECK-BASELINE-NEXT: movw %bp, 14(%rax)
-; CHECK-BASELINE-NEXT: movw %bx, 12(%rax)
-; CHECK-BASELINE-NEXT: movw %di, 10(%rax)
+; CHECK-BASELINE-NEXT: movl %r10d, %r11d
+; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT: xorl %r10d, %r11d
+; CHECK-BASELINE-NEXT: movl %edi, %r10d
+; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT: xorl %edi, %r10d
+; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax)
+; CHECK-BASELINE-NEXT: movw %r11w, 12(%rax)
+; CHECK-BASELINE-NEXT: movw %bx, 10(%rax)
; CHECK-BASELINE-NEXT: movw %r9w, 8(%rax)
; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT: movw %cx, 4(%rax)
@@ -941,44 +941,44 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT: xorl %ebp, %esi
+; CHECK-SSE1-NEXT: xorl %r12d, %esi
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT: xorl %ebp, %esi
-; CHECK-SSE1-NEXT: xorl %ebx, %edx
+; CHECK-SSE1-NEXT: xorl %r12d, %esi
+; CHECK-SSE1-NEXT: xorl %r15d, %edx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx
-; CHECK-SSE1-NEXT: xorl %ebx, %edx
-; CHECK-SSE1-NEXT: xorl %edi, %ecx
+; CHECK-SSE1-NEXT: xorl %r15d, %edx
+; CHECK-SSE1-NEXT: xorl %r14d, %ecx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT: xorl %edi, %ecx
-; CHECK-SSE1-NEXT: xorl %r12d, %r8d
+; CHECK-SSE1-NEXT: xorl %r14d, %ecx
+; CHECK-SSE1-NEXT: xorl %ebp, %r8d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w
-; CHECK-SSE1-NEXT: xorl %r12d, %r8d
-; CHECK-SSE1-NEXT: xorl %r15d, %r9d
+; CHECK-SSE1-NEXT: xorl %ebp, %r8d
+; CHECK-SSE1-NEXT: xorl %ebx, %r9d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w
-; CHECK-SSE1-NEXT: xorl %r15d, %r9d
-; CHECK-SSE1-NEXT: movl %r14d, %edi
-; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %di
-; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di
-; CHECK-SSE1-NEXT: xorl %r14d, %edi
+; CHECK-SSE1-NEXT: xorl %ebx, %r9d
; CHECK-SSE1-NEXT: movl %r11d, %ebx
; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx
; CHECK-SSE1-NEXT: xorl %r11d, %ebx
-; CHECK-SSE1-NEXT: movl %r10d, %ebp
-; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bp
-; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bp
-; CHECK-SSE1-NEXT: xorl %r10d, %ebp
-; CHECK-SSE1-NEXT: movw %bp, 14(%rax)
-; CHECK-SSE1-NEXT: movw %bx, 12(%rax)
-; CHECK-SSE1-NEXT: movw %di, 10(%rax)
+; CHECK-SSE1-NEXT: movl %r10d, %r11d
+; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT: xorl %r10d, %r11d
+; CHECK-SSE1-NEXT: movl %edi, %r10d
+; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT: xorl %edi, %r10d
+; CHECK-SSE1-NEXT: movw %r10w, 14(%rax)
+; CHECK-SSE1-NEXT: movw %r11w, 12(%rax)
+; CHECK-SSE1-NEXT: movw %bx, 10(%rax)
; CHECK-SSE1-NEXT: movw %r9w, 8(%rax)
; CHECK-SSE1-NEXT: movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT: movw %cx, 4(%rax)
@@ -1013,30 +1013,30 @@ define <4 x i32> @out_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i32:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movl 8(%rdx), %r9d
-; CHECK-BASELINE-NEXT: movl (%rdx), %edi
+; CHECK-BASELINE-NEXT: movl 12(%rdx), %edi
+; CHECK-BASELINE-NEXT: movl 8(%rdx), %r8d
+; CHECK-BASELINE-NEXT: movl (%rdx), %r9d
; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d
-; CHECK-BASELINE-NEXT: movl (%rsi), %r11d
-; CHECK-BASELINE-NEXT: xorl %edi, %r11d
-; CHECK-BASELINE-NEXT: andl (%rcx), %r11d
-; CHECK-BASELINE-NEXT: xorl %edi, %r11d
-; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi
-; CHECK-BASELINE-NEXT: xorl %r10d, %edi
-; CHECK-BASELINE-NEXT: andl 4(%rcx), %edi
-; CHECK-BASELINE-NEXT: xorl %r10d, %edi
-; CHECK-BASELINE-NEXT: movl 8(%rsi), %edx
+; CHECK-BASELINE-NEXT: movl (%rsi), %edx
; CHECK-BASELINE-NEXT: xorl %r9d, %edx
-; CHECK-BASELINE-NEXT: andl 8(%rcx), %edx
+; CHECK-BASELINE-NEXT: andl (%rcx), %edx
; CHECK-BASELINE-NEXT: xorl %r9d, %edx
+; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d
+; CHECK-BASELINE-NEXT: xorl %r10d, %r9d
+; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d
+; CHECK-BASELINE-NEXT: xorl %r10d, %r9d
+; CHECK-BASELINE-NEXT: movl 8(%rsi), %r10d
+; CHECK-BASELINE-NEXT: xorl %r8d, %r10d
+; CHECK-BASELINE-NEXT: andl 8(%rcx), %r10d
+; CHECK-BASELINE-NEXT: xorl %r8d, %r10d
; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi
-; CHECK-BASELINE-NEXT: xorl %r8d, %esi
+; CHECK-BASELINE-NEXT: xorl %edi, %esi
; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi
-; CHECK-BASELINE-NEXT: xorl %r8d, %esi
+; CHECK-BASELINE-NEXT: xorl %edi, %esi
; CHECK-BASELINE-NEXT: movl %esi, 12(%rax)
-; CHECK-BASELINE-NEXT: movl %edx, 8(%rax)
-; CHECK-BASELINE-NEXT: movl %edi, 4(%rax)
-; CHECK-BASELINE-NEXT: movl %r11d, (%rax)
+; CHECK-BASELINE-NEXT: movl %r10d, 8(%rax)
+; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax)
+; CHECK-BASELINE-NEXT: movl %edx, (%rax)
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: out_v4i32:
@@ -1079,27 +1079,27 @@ define <4 x i32> @out_v4i32_undef(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i32_undef:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movl 8(%rsi), %r9d
+; CHECK-BASELINE-NEXT: movl 8(%rsi), %edi
; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movl (%rdx), %edi
-; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d
-; CHECK-BASELINE-NEXT: andl 8(%rcx), %r9d
-; CHECK-BASELINE-NEXT: movl (%rsi), %edx
-; CHECK-BASELINE-NEXT: xorl %edi, %edx
-; CHECK-BASELINE-NEXT: andl (%rcx), %edx
-; CHECK-BASELINE-NEXT: xorl %edi, %edx
-; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi
-; CHECK-BASELINE-NEXT: xorl %r10d, %edi
-; CHECK-BASELINE-NEXT: andl 4(%rcx), %edi
-; CHECK-BASELINE-NEXT: xorl %r10d, %edi
-; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi
-; CHECK-BASELINE-NEXT: xorl %r8d, %esi
-; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi
-; CHECK-BASELINE-NEXT: xorl %r8d, %esi
-; CHECK-BASELINE-NEXT: movl %r9d, 8(%rax)
-; CHECK-BASELINE-NEXT: movl %esi, 12(%rax)
-; CHECK-BASELINE-NEXT: movl %edi, 4(%rax)
-; CHECK-BASELINE-NEXT: movl %edx, (%rax)
+; CHECK-BASELINE-NEXT: movl (%rdx), %r9d
+; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx
+; CHECK-BASELINE-NEXT: andl 8(%rcx), %edi
+; CHECK-BASELINE-NEXT: movl (%rsi), %r10d
+; CHECK-BASELINE-NEXT: xorl %r9d, %r10d
+; CHECK-BASELINE-NEXT: andl (%rcx), %r10d
+; CHECK-BASELINE-NEXT: xorl %r9d, %r10d
+; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d
+; CHECK-BASELINE-NEXT: xorl %edx, %r9d
+; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d
+; CHECK-BASELINE-NEXT: xorl %edx, %r9d
+; CHECK-BASELINE-NEXT: movl 12(%rsi), %edx
+; CHECK-BASELINE-NEXT: xorl %r8d, %edx
+; CHECK-BASELINE-NEXT: andl 12(%rcx), %edx
+; CHECK-BASELINE-NEXT: xorl %r8d, %edx
+; CHECK-BASELINE-NEXT: movl %edi, 8(%rax)
+; CHECK-BASELINE-NEXT: movl %edx, 12(%rax)
+; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax)
+; CHECK-BASELINE-NEXT: movl %r10d, (%rax)
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: out_v4i32_undef:
@@ -1210,21 +1210,21 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebp
-; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r14d
-; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r15d
-; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r12d
-; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %esi
+; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebx
+; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %ebp
+; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r14d
+; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r15d
+; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r12d
; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %r13d
; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx
; CHECK-BASELINE-NEXT: movzbl 2(%r8), %edi
; CHECK-BASELINE-NEXT: movzbl (%r8), %eax
; CHECK-BASELINE-NEXT: movzbl 1(%r8), %ecx
-; CHECK-BASELINE-NEXT: movzbl (%r9), %ebx
-; CHECK-BASELINE-NEXT: xorb %al, %bl
-; CHECK-BASELINE-NEXT: andb (%r10), %bl
-; CHECK-BASELINE-NEXT: xorb %al, %bl
-; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl (%rsi), %esi
+; CHECK-BASELINE-NEXT: xorb %al, %sil
+; CHECK-BASELINE-NEXT: andb (%r10), %sil
+; CHECK-BASELINE-NEXT: xorb %al, %sil
+; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 1(%r9), %eax
; CHECK-BASELINE-NEXT: xorb %cl, %al
; CHECK-BASELINE-NEXT: andb 1(%r10), %al
@@ -1246,29 +1246,29 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: xorb %r13b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 5(%r9), %eax
-; CHECK-BASELINE-NEXT: xorb %sil, %al
+; CHECK-BASELINE-NEXT: xorb %r12b, %al
; CHECK-BASELINE-NEXT: andb 5(%r10), %al
-; CHECK-BASELINE-NEXT: xorb %sil, %al
+; CHECK-BASELINE-NEXT: xorb %r12b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 6(%r9), %eax
-; CHECK-BASELINE-NEXT: xorb %r12b, %al
+; CHECK-BASELINE-NEXT: xorb %r15b, %al
; CHECK-BASELINE-NEXT: andb 6(%r10), %al
-; CHECK-BASELINE-NEXT: xorb %r12b, %al
+; CHECK-BASELINE-NEXT: xorb %r15b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 7(%r9), %eax
-; CHECK-BASELINE-NEXT: xorb %r15b, %al
+; CHECK-BASELINE-NEXT: xorb %r14b, %al
; CHECK-BASELINE-NEXT: andb 7(%r10), %al
-; CHECK-BASELINE-NEXT: xorb %r15b, %al
+; CHECK-BASELINE-NEXT: xorb %r14b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 8(%r9), %eax
-; CHECK-BASELINE-NEXT: xorb %r14b, %al
+; CHECK-BASELINE-NEXT: xorb %bpl, %al
; CHECK-BASELINE-NEXT: andb 8(%r10), %al
-; CHECK-BASELINE-NEXT: xorb %r14b, %al
+; CHECK-BASELINE-NEXT: xorb %bpl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 9(%r9), %eax
-; CHECK-BASELINE-NEXT: xorb %bpl, %al
+; CHECK-BASELINE-NEXT: xorb %bl, %al
; CHECK-BASELINE-NEXT: andb 9(%r10), %al
-; CHECK-BASELINE-NEXT: xorb %bpl, %al
+; CHECK-BASELINE-NEXT: xorb %bl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 10(%r9), %eax
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -1357,10 +1357,10 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: andb 24(%r10), %r14b
; CHECK-BASELINE-NEXT: xorb %al, %r14b
; CHECK-BASELINE-NEXT: movzbl 25(%r8), %eax
-; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebp
-; CHECK-BASELINE-NEXT: xorb %al, %bpl
-; CHECK-BASELINE-NEXT: andb 25(%r10), %bpl
-; CHECK-BASELINE-NEXT: xorb %al, %bpl
+; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebx
+; CHECK-BASELINE-NEXT: xorb %al, %bl
+; CHECK-BASELINE-NEXT: andb 25(%r10), %bl
+; CHECK-BASELINE-NEXT: xorb %al, %bl
; CHECK-BASELINE-NEXT: movzbl 26(%r8), %eax
; CHECK-BASELINE-NEXT: movzbl 26(%r9), %edi
; CHECK-BASELINE-NEXT: xorb %al, %dil
@@ -1381,23 +1381,23 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: andb 29(%r10), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebx
+; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebp
; CHECK-BASELINE-NEXT: movzbl 30(%r9), %eax
-; CHECK-BASELINE-NEXT: xorb %bl, %al
+; CHECK-BASELINE-NEXT: xorb %bpl, %al
; CHECK-BASELINE-NEXT: andb 30(%r10), %al
-; CHECK-BASELINE-NEXT: xorb %bl, %al
+; CHECK-BASELINE-NEXT: xorb %bpl, %al
; CHECK-BASELINE-NEXT: movzbl 31(%r8), %r8d
-; CHECK-BASELINE-NEXT: movzbl 31(%r9), %ebx
-; CHECK-BASELINE-NEXT: xorb %r8b, %bl
-; CHECK-BASELINE-NEXT: andb 31(%r10), %bl
-; CHECK-BASELINE-NEXT: xorb %r8b, %bl
-; CHECK-BASELINE-NEXT: movb %bl, 31(%r11)
+; CHECK-BASELINE-NEXT: movzbl 31(%r9), %r9d
+; CHECK-BASELINE-NEXT: xorb %r8b, %r9b
+; CHECK-BASELINE-NEXT: andb 31(%r10), %r9b
+; CHECK-BASELINE-NEXT: xorb %r8b, %r9b
+; CHECK-BASELINE-NEXT: movb %r9b, 31(%r11)
; CHECK-BASELINE-NEXT: movb %al, 30(%r11)
; CHECK-BASELINE-NEXT: movb %cl, 29(%r11)
; CHECK-BASELINE-NEXT: movb %dl, 28(%r11)
; CHECK-BASELINE-NEXT: movb %sil, 27(%r11)
; CHECK-BASELINE-NEXT: movb %dil, 26(%r11)
-; CHECK-BASELINE-NEXT: movb %bpl, 25(%r11)
+; CHECK-BASELINE-NEXT: movb %bl, 25(%r11)
; CHECK-BASELINE-NEXT: movb %r14b, 24(%r11)
; CHECK-BASELINE-NEXT: movb %r15b, 23(%r11)
; CHECK-BASELINE-NEXT: movb %r12b, 22(%r11)
@@ -1477,21 +1477,21 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebp
-; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r14d
-; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r15d
-; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r12d
-; CHECK-SSE1-NEXT: movzbl 5(%rdx), %esi
+; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebx
+; CHECK-SSE1-NEXT: movzbl 8(%rdx), %ebp
+; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r14d
+; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r15d
+; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r12d
; CHECK-SSE1-NEXT: movzbl 4(%rdx), %r13d
; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx
; CHECK-SSE1-NEXT: movzbl 2(%r8), %edi
; CHECK-SSE1-NEXT: movzbl (%r8), %eax
; CHECK-SSE1-NEXT: movzbl 1(%r8), %ecx
-; CHECK-SSE1-NEXT: movzbl (%r9), %ebx
-; CHECK-SSE1-NEXT: xorb %al, %bl
-; CHECK-SSE1-NEXT: andb (%r10), %bl
-; CHECK-SSE1-NEXT: xorb %al, %bl
-; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl (%rsi), %esi
+; CHECK-SSE1-NEXT: xorb %al, %sil
+; CHECK-SSE1-NEXT: andb (%r10), %sil
+; CHECK-SSE1-NEXT: xorb %al, %sil
+; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 1(%r9), %eax
; CHECK-SSE1-NEXT: xorb %cl, %al
; CHECK-SSE1-NEXT: andb 1(%r10), %al
@@ -1513,29 +1513,29 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: xorb %r13b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 5(%r9), %eax
-; CHECK-SSE1-NEXT: xorb %sil, %al
+; CHECK-SSE1-NEXT: xorb %r12b, %al
; CHECK-SSE1-NEXT: andb 5(%r10), %al
-; CHECK-SSE1-NEXT: xorb %sil, %al
+; CHECK-SSE1-NEXT: xorb %r12b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 6(%r9), %eax
-; CHECK-SSE1-NEXT: xorb %r12b, %al
+; CHECK-SSE1-NEXT: xorb %r15b, %al
; CHECK-SSE1-NEXT: andb 6(%r10), %al
-; CHECK-SSE1-NEXT: xorb %r12b, %al
+; CHECK-SSE1-NEXT: xorb %r15b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 7(%r9), %eax
-; CHECK-SSE1-NEXT: xorb %r15b, %al
+; CHECK-SSE1-NEXT: xorb %r14b, %al
; CHECK-SSE1-NEXT: andb 7(%r10), %al
-; CHECK-SSE1-NEXT: xorb %r15b, %al
+; CHECK-SSE1-NEXT: xorb %r14b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 8(%r9), %eax
-; CHECK-SSE1-NEXT: xorb %r14b, %al
+; CHECK-SSE1-NEXT: xorb %bpl, %al
; CHECK-SSE1-NEXT: andb 8(%r10), %al
-; CHECK-SSE1-NEXT: xorb %r14b, %al
+; CHECK-SSE1-NEXT: xorb %bpl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 9(%r9), %eax
-; CHECK-SSE1-NEXT: xorb %bpl, %al
+; CHECK-SSE1-NEXT: xorb %bl, %al
; CHECK-SSE1-NEXT: andb 9(%r10), %al
-; CHECK-SSE1-NEXT: xorb %bpl, %al
+; CHECK-SSE1-NEXT: xorb %bl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 10(%r9), %eax
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -1624,10 +1624,10 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: andb 24(%r10), %r14b
; CHECK-SSE1-NEXT: xorb %al, %r14b
; CHECK-SSE1-NEXT: movzbl 25(%r8), %eax
-; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebp
-; CHECK-SSE1-NEXT: xorb %al, %bpl
-; CHECK-SSE1-NEXT: andb 25(%r10), %bpl
-; CHECK-SSE1-NEXT: xorb %al, %bpl
+; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebx
+; CHECK-SSE1-NEXT: xorb %al, %bl
+; CHECK-SSE1-NEXT: andb 25(%r10), %bl
+; CHECK-SSE1-NEXT: xorb %al, %bl
; CHECK-SSE1-NEXT: movzbl 26(%r8), %eax
; CHECK-SSE1-NEXT: movzbl 26(%r9), %edi
; CHECK-SSE1-NEXT: xorb %al, %dil
@@ -1648,23 +1648,23 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: andb 29(%r10), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebx
+; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebp
; CHECK-SSE1-NEXT: movzbl 30(%r9), %eax
-; CHECK-SSE1-NEXT: xorb %bl, %al
+; CHECK-SSE1-NEXT: xorb %bpl, %al
; CHECK-SSE1-NEXT: andb 30(%r10), %al
-; CHECK-SSE1-NEXT: xorb %bl, %al
+; CHECK-SSE1-NEXT: xorb %bpl, %al
; CHECK-SSE1-NEXT: movzbl 31(%r8), %r8d
-; CHECK-SSE1-NEXT: movzbl 31(%r9), %ebx
-; CHECK-SSE1-NEXT: xorb %r8b, %bl
-; CHECK-SSE1-NEXT: andb 31(%r10), %bl
-; CHECK-SSE1-NEXT: xorb %r8b, %bl
-; CHECK-SSE1-NEXT: movb %bl, 31(%r11)
+; CHECK-SSE1-NEXT: movzbl 31(%r9), %r9d
+; CHECK-SSE1-NEXT: xorb %r8b, %r9b
+; CHECK-SSE1-NEXT: andb 31(%r10), %r9b
+; CHECK-SSE1-NEXT: xorb %r8b, %r9b
+; CHECK-SSE1-NEXT: movb %r9b, 31(%r11)
; CHECK-SSE1-NEXT: movb %al, 30(%r11)
; CHECK-SSE1-NEXT: movb %cl, 29(%r11)
; CHECK-SSE1-NEXT: movb %dl, 28(%r11)
; CHECK-SSE1-NEXT: movb %sil, 27(%r11)
; CHECK-SSE1-NEXT: movb %dil, 26(%r11)
-; CHECK-SSE1-NEXT: movb %bpl, 25(%r11)
+; CHECK-SSE1-NEXT: movb %bl, 25(%r11)
; CHECK-SSE1-NEXT: movb %r14b, 24(%r11)
; CHECK-SSE1-NEXT: movb %r15b, 23(%r11)
; CHECK-SSE1-NEXT: movb %r12b, 22(%r11)
@@ -1761,51 +1761,51 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d
; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d
-; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %r11d
-; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %r13d
-; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r9d
-; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %ebx
-; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r12d
-; CHECK-BASELINE-NEXT: movzwl (%rdx), %ebp
-; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r10d
+; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp
+; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx
+; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r13d
+; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d
+; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d
+; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d
+; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d
+; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d
; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %bp, %ax
+; CHECK-BASELINE-NEXT: xorw %r8w, %ax
; CHECK-BASELINE-NEXT: andw (%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %ebp
-; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: xorl %eax, %r8d
+; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r10w, %ax
+; CHECK-BASELINE-NEXT: xorw %r12w, %ax
; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r10d
+; CHECK-BASELINE-NEXT: xorl %eax, %r12d
; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r12w, %ax
+; CHECK-BASELINE-NEXT: xorw %r9w, %ax
; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r12d
+; CHECK-BASELINE-NEXT: xorl %eax, %r9d
+; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %bx, %ax
+; CHECK-BASELINE-NEXT: xorw %r10w, %ax
; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %ebx
-; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: xorl %eax, %r10d
+; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r8w, %ax
+; CHECK-BASELINE-NEXT: xorw %r11w, %ax
; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r8d
-; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: xorl %eax, %r11d
+; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r9w, %ax
+; CHECK-BASELINE-NEXT: xorw %r13w, %ax
; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r9d
-; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: xorl %eax, %r13d
+; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r13w, %ax
+; CHECK-BASELINE-NEXT: xorw %bx, %ax
; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r13d
+; CHECK-BASELINE-NEXT: xorl %eax, %ebx
; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r11w, %ax
+; CHECK-BASELINE-NEXT: xorw %bp, %ax
; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r11d
-; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: xorl %eax, %ebp
; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax
; CHECK-BASELINE-NEXT: xorw %r14w, %ax
; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax
@@ -1814,11 +1814,11 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: xorw %r15w, %ax
; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax
; CHECK-BASELINE-NEXT: xorl %eax, %r15d
-; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %ebx
+; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d
; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %bx, %ax
+; CHECK-BASELINE-NEXT: xorw %r13w, %ax
; CHECK-BASELINE-NEXT: andw 20(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %ebx
+; CHECK-BASELINE-NEXT: xorl %eax, %r13d
; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d
; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax
; CHECK-BASELINE-NEXT: xorw %r9w, %ax
@@ -1830,39 +1830,39 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax
; CHECK-BASELINE-NEXT: xorl %eax, %r8d
; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax
-; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r11d
-; CHECK-BASELINE-NEXT: xorw %ax, %r11w
-; CHECK-BASELINE-NEXT: andw 26(%rcx), %r11w
-; CHECK-BASELINE-NEXT: xorl %r11d, %eax
-; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r11d
-; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %ebp
-; CHECK-BASELINE-NEXT: xorw %r11w, %bp
-; CHECK-BASELINE-NEXT: andw 28(%rcx), %bp
-; CHECK-BASELINE-NEXT: xorl %ebp, %r11d
+; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r10d
+; CHECK-BASELINE-NEXT: xorw %ax, %r10w
+; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w
+; CHECK-BASELINE-NEXT: xorl %r10d, %eax
+; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r10d
+; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r11d
+; CHECK-BASELINE-NEXT: xorw %r10w, %r11w
+; CHECK-BASELINE-NEXT: andw 28(%rcx), %r11w
+; CHECK-BASELINE-NEXT: xorl %r11d, %r10d
; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edx
; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi
; CHECK-BASELINE-NEXT: xorw %dx, %si
; CHECK-BASELINE-NEXT: andw 30(%rcx), %si
; CHECK-BASELINE-NEXT: xorl %esi, %edx
; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi)
-; CHECK-BASELINE-NEXT: movw %r11w, 28(%rdi)
+; CHECK-BASELINE-NEXT: movw %r10w, 28(%rdi)
; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi)
; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi)
; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi)
-; CHECK-BASELINE-NEXT: movw %bx, 20(%rdi)
+; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi)
; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi)
; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi)
-; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT: movw %ax, 14(%rdi)
-; CHECK-BASELINE-NEXT: movw %r13w, 12(%rdi)
+; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi)
+; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi)
-; CHECK-BASELINE-NEXT: movw %r12w, 4(%rdi)
-; CHECK-BASELINE-NEXT: movw %r10w, 2(%rdi)
+; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi)
+; CHECK-BASELINE-NEXT: movw %r12w, 2(%rdi)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT: movw %ax, (%rdi)
; CHECK-BASELINE-NEXT: movq %rdi, %rax
@@ -1884,51 +1884,51 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: pushq %rbx
; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d
; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d
-; CHECK-SSE1-NEXT: movzwl 14(%rdx), %r11d
-; CHECK-SSE1-NEXT: movzwl 12(%rdx), %r13d
-; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r9d
-; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r8d
-; CHECK-SSE1-NEXT: movzwl 6(%rdx), %ebx
-; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r12d
-; CHECK-SSE1-NEXT: movzwl (%rdx), %ebp
-; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r10d
+; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp
+; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx
+; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d
+; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d
+; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d
+; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d
+; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d
+; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d
; CHECK-SSE1-NEXT: movzwl (%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %bp, %ax
+; CHECK-SSE1-NEXT: xorw %r8w, %ax
; CHECK-SSE1-NEXT: andw (%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %ebp
-; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: xorl %eax, %r8d
+; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r10w, %ax
+; CHECK-SSE1-NEXT: xorw %r12w, %ax
; CHECK-SSE1-NEXT: andw 2(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r10d
+; CHECK-SSE1-NEXT: xorl %eax, %r12d
; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r12w, %ax
+; CHECK-SSE1-NEXT: xorw %r9w, %ax
; CHECK-SSE1-NEXT: andw 4(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r12d
+; CHECK-SSE1-NEXT: xorl %eax, %r9d
+; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %bx, %ax
+; CHECK-SSE1-NEXT: xorw %r10w, %ax
; CHECK-SSE1-NEXT: andw 6(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %ebx
-; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: xorl %eax, %r10d
+; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r8w, %ax
+; CHECK-SSE1-NEXT: xorw %r11w, %ax
; CHECK-SSE1-NEXT: andw 8(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r8d
-; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: xorl %eax, %r11d
+; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r9w, %ax
+; CHECK-SSE1-NEXT: xorw %r13w, %ax
; CHECK-SSE1-NEXT: andw 10(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r9d
-; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: xorl %eax, %r13d
+; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r13w, %ax
+; CHECK-SSE1-NEXT: xorw %bx, %ax
; CHECK-SSE1-NEXT: andw 12(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r13d
+; CHECK-SSE1-NEXT: xorl %eax, %ebx
; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r11w, %ax
+; CHECK-SSE1-NEXT: xorw %bp, %ax
; CHECK-SSE1-NEXT: andw 14(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r11d
-; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: xorl %eax, %ebp
; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax
; CHECK-SSE1-NEXT: xorw %r14w, %ax
; CHECK-SSE1-NEXT: andw 16(%rcx), %ax
@@ -1937,11 +1937,11 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: xorw %r15w, %ax
; CHECK-SSE1-NEXT: andw 18(%rcx), %ax
; CHECK-SSE1-NEXT: xorl %eax, %r15d
-; CHECK-SSE1-NEXT: movzwl 20(%rdx), %ebx
+; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d
; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %bx, %ax
+; CHECK-SSE1-NEXT: xorw %r13w, %ax
; CHECK-SSE1-NEXT: andw 20(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %ebx
+; CHECK-SSE1-NEXT: xorl %eax, %r13d
; CHECK-SSE1-NEXT: movzwl 22(%rdx), %r9d
; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax
; CHECK-SSE1-NEXT: xorw %r9w, %ax
@@ -1953,39 +1953,39 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: andw 24(%rcx), %ax
; CHECK-SSE1-NEXT: xorl %eax, %r8d
; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax
-; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r11d
-; CHECK-SSE1-NEXT: xorw %ax, %r11w
-; CHECK-SSE1-NEXT: andw 26(%rcx), %r11w
-; CHECK-SSE1-NEXT: xorl %r11d, %eax
-; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r11d
-; CHECK-SSE1-NEXT: movzwl 28(%rsi), %ebp
-; CHECK-SSE1-NEXT: xorw %r11w, %bp
-; CHECK-SSE1-NEXT: andw 28(%rcx), %bp
-; CHECK-SSE1-NEXT: xorl %ebp, %r11d
+; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d
+; CHECK-SSE1-NEXT: xorw %ax, %r10w
+; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w
+; CHECK-SSE1-NEXT: xorl %r10d, %eax
+; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r10d
+; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r11d
+; CHECK-SSE1-NEXT: xorw %r10w, %r11w
+; CHECK-SSE1-NEXT: andw 28(%rcx), %r11w
+; CHECK-SSE1-NEXT: xorl %r11d, %r10d
; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edx
; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi
; CHECK-SSE1-NEXT: xorw %dx, %si
; CHECK-SSE1-NEXT: andw 30(%rcx), %si
; CHECK-SSE1-NEXT: xorl %esi, %edx
; CHECK-SSE1-NEXT: movw %dx, 30(%rdi)
-; CHECK-SSE1-NEXT: movw %r11w, 28(%rdi)
+; CHECK-SSE1-NEXT: movw %r10w, 28(%rdi)
; CHECK-SSE1-NEXT: movw %ax, 26(%rdi)
; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi)
; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi)
-; CHECK-SSE1-NEXT: movw %bx, 20(%rdi)
+; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi)
; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi)
; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi)
-; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT: movw %ax, 14(%rdi)
-; CHECK-SSE1-NEXT: movw %r13w, 12(%rdi)
+; CHECK-SSE1-NEXT: movw %bp, 14(%rdi)
+; CHECK-SSE1-NEXT: movw %bx, 12(%rdi)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT: movw %ax, 10(%rdi)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT: movw %ax, 8(%rdi)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT: movw %ax, 6(%rdi)
-; CHECK-SSE1-NEXT: movw %r12w, 4(%rdi)
-; CHECK-SSE1-NEXT: movw %r10w, 2(%rdi)
+; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT: movw %ax, 4(%rdi)
+; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT: movw %ax, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
@@ -2031,126 +2031,118 @@ define <8 x i32> @out_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v8i32:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: pushq %rbp
-; CHECK-BASELINE-NEXT: pushq %r15
; CHECK-BASELINE-NEXT: pushq %r14
-; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movl 28(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movl 24(%rdx), %r9d
+; CHECK-BASELINE-NEXT: movl 28(%rdx), %edi
+; CHECK-BASELINE-NEXT: movl 24(%rdx), %r8d
; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d
-; CHECK-BASELINE-NEXT: movl 16(%rdx), %r14d
-; CHECK-BASELINE-NEXT: movl 12(%rdx), %ebx
+; CHECK-BASELINE-NEXT: movl 16(%rdx), %ebx
+; CHECK-BASELINE-NEXT: movl 12(%rdx), %r14d
; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebp
-; CHECK-BASELINE-NEXT: movl (%rdx), %edi
-; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx
-; CHECK-BASELINE-NEXT: movl (%rsi), %r11d
-; CHECK-BASELINE-NEXT: xorl %edi, %r11d
-; CHECK-BASELINE-NEXT: andl (%rcx), %r11d
-; CHECK-BASELINE-NEXT: xorl %edi, %r11d
-; CHECK-BASELINE-NEXT: movl 4(%rsi), %r15d
-; CHECK-BASELINE-NEXT: xorl %edx, %r15d
-; CHECK-BASELINE-NEXT: andl 4(%rcx), %r15d
-; CHECK-BASELINE-NEXT: xorl %edx, %r15d
-; CHECK-BASELINE-NEXT: movl 8(%rsi), %r12d
-; CHECK-BASELINE-NEXT: xorl %ebp, %r12d
-; CHECK-BASELINE-NEXT: andl 8(%rcx), %r12d
-; CHECK-BASELINE-NEXT: xorl %ebp, %r12d
-; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebp
-; CHECK-BASELINE-NEXT: xorl %ebx, %ebp
-; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebp
-; CHECK-BASELINE-NEXT: xorl %ebx, %ebp
-; CHECK-BASELINE-NEXT: movl 16(%rsi), %ebx
-; CHECK-BASELINE-NEXT: xorl %r14d, %ebx
-; CHECK-BASELINE-NEXT: andl 16(%rcx), %ebx
-; CHECK-BASELINE-NEXT: xorl %r14d, %ebx
-; CHECK-BASELINE-NEXT: movl 20(%rsi), %edi
-; CHECK-BASELINE-NEXT: xorl %r10d, %edi
-; CHECK-BASELINE-NEXT: andl 20(%rcx), %edi
-; CHECK-BASELINE-NEXT: xorl %r10d, %edi
-; CHECK-BASELINE-NEXT: movl 24(%rsi), %edx
+; CHECK-BASELINE-NEXT: movl (%rdx), %r9d
+; CHECK-BASELINE-NEXT: movl 4(%rdx), %r11d
+; CHECK-BASELINE-NEXT: movl (%rsi), %edx
; CHECK-BASELINE-NEXT: xorl %r9d, %edx
-; CHECK-BASELINE-NEXT: andl 24(%rcx), %edx
+; CHECK-BASELINE-NEXT: andl (%rcx), %edx
; CHECK-BASELINE-NEXT: xorl %r9d, %edx
+; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d
+; CHECK-BASELINE-NEXT: xorl %r11d, %r9d
+; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d
+; CHECK-BASELINE-NEXT: xorl %r11d, %r9d
+; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d
+; CHECK-BASELINE-NEXT: xorl %ebp, %r11d
+; CHECK-BASELINE-NEXT: andl 8(%rcx), %r11d
+; CHECK-BASELINE-NEXT: xorl %ebp, %r11d
+; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebp
+; CHECK-BASELINE-NEXT: xorl %r14d, %ebp
+; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebp
+; CHECK-BASELINE-NEXT: xorl %r14d, %ebp
+; CHECK-BASELINE-NEXT: movl 16(%rsi), %r14d
+; CHECK-BASELINE-NEXT: xorl %ebx, %r14d
+; CHECK-BASELINE-NEXT: andl 16(%rcx), %r14d
+; CHECK-BASELINE-NEXT: xorl %ebx, %r14d
+; CHECK-BASELINE-NEXT: movl 20(%rsi), %ebx
+; CHECK-BASELINE-NEXT: xorl %r10d, %ebx
+; CHECK-BASELINE-NEXT: andl 20(%rcx), %ebx
+; CHECK-BASELINE-NEXT: xorl %r10d, %ebx
+; CHECK-BASELINE-NEXT: movl 24(%rsi), %r10d
+; CHECK-BASELINE-NEXT: xorl %r8d, %r10d
+; CHECK-BASELINE-NEXT: andl 24(%rcx), %r10d
+; CHECK-BASELINE-NEXT: xorl %r8d, %r10d
; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi
-; CHECK-BASELINE-NEXT: xorl %r8d, %esi
+; CHECK-BASELINE-NEXT: xorl %edi, %esi
; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi
-; CHECK-BASELINE-NEXT: xorl %r8d, %esi
+; CHECK-BASELINE-NEXT: xorl %edi, %esi
; CHECK-BASELINE-NEXT: movl %esi, 28(%rax)
-; CHECK-BASELINE-NEXT: movl %edx, 24(%rax)
-; CHECK-BASELINE-NEXT: movl %edi, 20(%rax)
-; CHECK-BASELINE-NEXT: movl %ebx, 16(%rax)
+; CHECK-BASELINE-NEXT: movl %r10d, 24(%rax)
+; CHECK-BASELINE-NEXT: movl %ebx, 20(%rax)
+; CHECK-BASELINE-NEXT: movl %r14d, 16(%rax)
; CHECK-BASELINE-NEXT: movl %ebp, 12(%rax)
-; CHECK-BASELINE-NEXT: movl %r12d, 8(%rax)
-; CHECK-BASELINE-NEXT: movl %r15d, 4(%rax)
-; CHECK-BASELINE-NEXT: movl %r11d, (%rax)
+; CHECK-BASELINE-NEXT: movl %r11d, 8(%rax)
+; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax)
+; CHECK-BASELINE-NEXT: movl %edx, (%rax)
; CHECK-BASELINE-NEXT: popq %rbx
-; CHECK-BASELINE-NEXT: popq %r12
; CHECK-BASELINE-NEXT: popq %r14
-; CHECK-BASELINE-NEXT: popq %r15
; CHECK-BASELINE-NEXT: popq %rbp
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: out_v8i32:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: pushq %rbp
-; CHECK-SSE1-NEXT: pushq %r15
; CHECK-SSE1-NEXT: pushq %r14
-; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movl 28(%rdx), %r8d
-; CHECK-SSE1-NEXT: movl 24(%rdx), %r9d
+; CHECK-SSE1-NEXT: movl 28(%rdx), %edi
+; CHECK-SSE1-NEXT: movl 24(%rdx), %r8d
; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d
-; CHECK-SSE1-NEXT: movl 16(%rdx), %r14d
-; CHECK-SSE1-NEXT: movl 12(%rdx), %ebx
+; CHECK-SSE1-NEXT: movl 16(%rdx), %ebx
+; CHECK-SSE1-NEXT: movl 12(%rdx), %r14d
; CHECK-SSE1-NEXT: movl 8(%rdx), %ebp
-; CHECK-SSE1-NEXT: movl (%rdx), %edi
-; CHECK-SSE1-NEXT: movl 4(%rdx), %edx
-; CHECK-SSE1-NEXT: movl (%rsi), %r11d
-; CHECK-SSE1-NEXT: xorl %edi, %r11d
-; CHECK-SSE1-NEXT: andl (%rcx), %r11d
-; CHECK-SSE1-NEXT: xorl %edi, %r11d
-; CHECK-SSE1-NEXT: movl 4(%rsi), %r15d
-; CHECK-SSE1-NEXT: xorl %edx, %r15d
-; CHECK-SSE1-NEXT: andl 4(%rcx), %r15d
-; CHECK-SSE1-NEXT: xorl %edx, %r15d
-; CHECK-SSE1-NEXT: movl 8(%rsi), %r12d
-; CHECK-SSE1-NEXT: xorl %ebp, %r12d
-; CHECK-SSE1-NEXT: andl 8(%rcx), %r12d
-; CHECK-SSE1-NEXT: xorl %ebp, %r12d
-; CHECK-SSE1-NEXT: movl 12(%rsi), %ebp
-; CHECK-SSE1-NEXT: xorl %ebx, %ebp
-; CHECK-SSE1-NEXT: andl 12(%rcx), %ebp
-; CHECK-SSE1-NEXT: xorl %ebx, %ebp
-; CHECK-SSE1-NEXT: movl 16(%rsi), %ebx
-; CHECK-SSE1-NEXT: xorl %r14d, %ebx
-; CHECK-SSE1-NEXT: andl 16(%rcx), %ebx
-; CHECK-SSE1-NEXT: xorl %r14d, %ebx
-; CHECK-SSE1-NEXT: movl 20(%rsi), %edi
-; CHECK-SSE1-NEXT: xorl %r10d, %edi
-; CHECK-SSE1-NEXT: andl 20(%rcx), %edi
-; CHECK-SSE1-NEXT: xorl %r10d, %edi
-; CHECK-SSE1-NEXT: movl 24(%rsi), %edx
+; CHECK-SSE1-NEXT: movl (%rdx), %r9d
+; CHECK-SSE1-NEXT: movl 4(%rdx), %r11d
+; CHECK-SSE1-NEXT: movl (%rsi), %edx
; CHECK-SSE1-NEXT: xorl %r9d, %edx
-; CHECK-SSE1-NEXT: andl 24(%rcx), %edx
+; CHECK-SSE1-NEXT: andl (%rcx), %edx
; CHECK-SSE1-NEXT: xorl %r9d, %edx
+; CHECK-SSE1-NEXT: movl 4(%rsi), %r9d
+; CHECK-SSE1-NEXT: xorl %r11d, %r9d
+; CHECK-SSE1-NEXT: andl 4(%rcx), %r9d
+; CHECK-SSE1-NEXT: xorl %r11d, %r9d
+; CHECK-SSE1-NEXT: movl 8(%rsi), %r11d
+; CHECK-SSE1-NEXT: xorl %ebp, %r11d
+; CHECK-SSE1-NEXT: andl 8(%rcx), %r11d
+; CHECK-SSE1-NEXT: xorl %ebp, %r11d
+; CHECK-SSE1-NEXT: movl 12(%rsi), %ebp
+; CHECK-SSE1-NEXT: xorl %r14d, %ebp
+; CHECK-SSE1-NEXT: andl 12(%rcx), %ebp
+; CHECK-SSE1-NEXT: xorl %r14d, %ebp
+; CHECK-SSE1-NEXT: movl 16(%rsi), %r14d
+; CHECK-SSE1-NEXT: xorl %ebx, %r14d
+; CHECK-SSE1-NEXT: andl 16(%rcx), %r14d
+; CHECK-SSE1-NEXT: xorl %ebx, %r14d
+; CHECK-SSE1-NEXT: movl 20(%rsi), %ebx
+; CHECK-SSE1-NEXT: xorl %r10d, %ebx
+; CHECK-SSE1-NEXT: andl 20(%rcx), %ebx
+; CHECK-SSE1-NEXT: xorl %r10d, %ebx
+; CHECK-SSE1-NEXT: movl 24(%rsi), %r10d
+; CHECK-SSE1-NEXT: xorl %r8d, %r10d
+; CHECK-SSE1-NEXT: andl 24(%rcx), %r10d
+; CHECK-SSE1-NEXT: xorl %r8d, %r10d
; CHECK-SSE1-NEXT: movl 28(%rsi), %esi
-; CHECK-SSE1-NEXT: xorl %r8d, %esi
+; CHECK-SSE1-NEXT: xorl %edi, %esi
; CHECK-SSE1-NEXT: andl 28(%rcx), %esi
-; CHECK-SSE1-NEXT: xorl %r8d, %esi
+; CHECK-SSE1-NEXT: xorl %edi, %esi
; CHECK-SSE1-NEXT: movl %esi, 28(%rax)
-; CHECK-SSE1-NEXT: movl %edx, 24(%rax)
-; CHECK-SSE1-NEXT: movl %edi, 20(%rax)
-; CHECK-SSE1-NEXT: movl %ebx, 16(%rax)
+; CHECK-SSE1-NEXT: movl %r10d, 24(%rax)
+; CHECK-SSE1-NEXT: movl %ebx, 20(%rax)
+; CHECK-SSE1-NEXT: movl %r14d, 16(%rax)
; CHECK-SSE1-NEXT: movl %ebp, 12(%rax)
-; CHECK-SSE1-NEXT: movl %r12d, 8(%rax)
-; CHECK-SSE1-NEXT: movl %r15d, 4(%rax)
-; CHECK-SSE1-NEXT: movl %r11d, (%rax)
+; CHECK-SSE1-NEXT: movl %r11d, 8(%rax)
+; CHECK-SSE1-NEXT: movl %r9d, 4(%rax)
+; CHECK-SSE1-NEXT: movl %edx, (%rax)
; CHECK-SSE1-NEXT: popq %rbx
-; CHECK-SSE1-NEXT: popq %r12
; CHECK-SSE1-NEXT: popq %r14
-; CHECK-SSE1-NEXT: popq %r15
; CHECK-SSE1-NEXT: popq %rbp
; CHECK-SSE1-NEXT: retq
;
@@ -2188,59 +2180,59 @@ define <4 x i64> @out_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i64:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movq 24(%rdx), %r8
-; CHECK-BASELINE-NEXT: movq 16(%rdx), %r9
-; CHECK-BASELINE-NEXT: movq (%rdx), %rdi
+; CHECK-BASELINE-NEXT: movq 24(%rdx), %rdi
+; CHECK-BASELINE-NEXT: movq 16(%rdx), %r8
+; CHECK-BASELINE-NEXT: movq (%rdx), %r9
; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10
-; CHECK-BASELINE-NEXT: movq (%rsi), %r11
-; CHECK-BASELINE-NEXT: xorq %rdi, %r11
-; CHECK-BASELINE-NEXT: andq (%rcx), %r11
-; CHECK-BASELINE-NEXT: xorq %rdi, %r11
-; CHECK-BASELINE-NEXT: movq 8(%rsi), %rdi
-; CHECK-BASELINE-NEXT: xorq %r10, %rdi
-; CHECK-BASELINE-NEXT: andq 8(%rcx), %rdi
-; CHECK-BASELINE-NEXT: xorq %r10, %rdi
-; CHECK-BASELINE-NEXT: movq 16(%rsi), %rdx
+; CHECK-BASELINE-NEXT: movq (%rsi), %rdx
; CHECK-BASELINE-NEXT: xorq %r9, %rdx
-; CHECK-BASELINE-NEXT: andq 16(%rcx), %rdx
+; CHECK-BASELINE-NEXT: andq (%rcx), %rdx
; CHECK-BASELINE-NEXT: xorq %r9, %rdx
+; CHECK-BASELINE-NEXT: movq 8(%rsi), %r9
+; CHECK-BASELINE-NEXT: xorq %r10, %r9
+; CHECK-BASELINE-NEXT: andq 8(%rcx), %r9
+; CHECK-BASELINE-NEXT: xorq %r10, %r9
+; CHECK-BASELINE-NEXT: movq 16(%rsi), %r10
+; CHECK-BASELINE-NEXT: xorq %r8, %r10
+; CHECK-BASELINE-NEXT: andq 16(%rcx), %r10
+; CHECK-BASELINE-NEXT: xorq %r8, %r10
; CHECK-BASELINE-NEXT: movq 24(%rsi), %rsi
-; CHECK-BASELINE-NEXT: xorq %r8, %rsi
+; CHECK-BASELINE-NEXT: xorq %rdi, %rsi
; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi
-; CHECK-BASELINE-NEXT: xorq %r8, %rsi
+; CHECK-BASELINE-NEXT: xorq %rdi, %rsi
; CHECK-BASELINE-NEXT: movq %rsi, 24(%rax)
-; CHECK-BASELINE-NEXT: movq %rdx, 16(%rax)
-; CHECK-BASELINE-NEXT: movq %rdi, 8(%rax)
-; CHECK-BASELINE-NEXT: movq %r11, (%rax)
+; CHECK-BASELINE-NEXT: movq %r10, 16(%rax)
+; CHECK-BASELINE-NEXT: movq %r9, 8(%rax)
+; CHECK-BASELINE-NEXT: movq %rdx, (%rax)
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: out_v4i64:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movq 24(%rdx), %r8
-; CHECK-SSE1-NEXT: movq 16(%rdx), %r9
-; CHECK-SSE1-NEXT: movq (%rdx), %rdi
+; CHECK-SSE1-NEXT: movq 24(%rdx), %rdi
+; CHECK-SSE1-NEXT: movq 16(%rdx), %r8
+; CHECK-SSE1-NEXT: movq (%rdx), %r9
; CHECK-SSE1-NEXT: movq 8(%rdx), %r10
-; CHECK-SSE1-NEXT: movq (%rsi), %r11
-; CHECK-SSE1-NEXT: xorq %rdi, %r11
-; CHECK-SSE1-NEXT: andq (%rcx), %r11
-; CHECK-SSE1-NEXT: xorq %rdi, %r11
-; CHECK-SSE1-NEXT: movq 8(%rsi), %rdi
-; CHECK-SSE1-NEXT: xorq %r10, %rdi
-; CHECK-SSE1-NEXT: andq 8(%rcx), %rdi
-; CHECK-SSE1-NEXT: xorq %r10, %rdi
-; CHECK-SSE1-NEXT: movq 16(%rsi), %rdx
+; CHECK-SSE1-NEXT: movq (%rsi), %rdx
; CHECK-SSE1-NEXT: xorq %r9, %rdx
-; CHECK-SSE1-NEXT: andq 16(%rcx), %rdx
+; CHECK-SSE1-NEXT: andq (%rcx), %rdx
; CHECK-SSE1-NEXT: xorq %r9, %rdx
+; CHECK-SSE1-NEXT: movq 8(%rsi), %r9
+; CHECK-SSE1-NEXT: xorq %r10, %r9
+; CHECK-SSE1-NEXT: andq 8(%rcx), %r9
+; CHECK-SSE1-NEXT: xorq %r10, %r9
+; CHECK-SSE1-NEXT: movq 16(%rsi), %r10
+; CHECK-SSE1-NEXT: xorq %r8, %r10
+; CHECK-SSE1-NEXT: andq 16(%rcx), %r10
+; CHECK-SSE1-NEXT: xorq %r8, %r10
; CHECK-SSE1-NEXT: movq 24(%rsi), %rsi
-; CHECK-SSE1-NEXT: xorq %r8, %rsi
+; CHECK-SSE1-NEXT: xorq %rdi, %rsi
; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi
-; CHECK-SSE1-NEXT: xorq %r8, %rsi
+; CHECK-SSE1-NEXT: xorq %rdi, %rsi
; CHECK-SSE1-NEXT: movq %rsi, 24(%rax)
-; CHECK-SSE1-NEXT: movq %rdx, 16(%rax)
-; CHECK-SSE1-NEXT: movq %rdi, 8(%rax)
-; CHECK-SSE1-NEXT: movq %r11, (%rax)
+; CHECK-SSE1-NEXT: movq %r10, 16(%rax)
+; CHECK-SSE1-NEXT: movq %r9, 8(%rax)
+; CHECK-SSE1-NEXT: movq %rdx, (%rax)
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: out_v4i64:
@@ -2501,20 +2493,20 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT: xorb %r11b, %sil
; CHECK-BASELINE-NEXT: xorb %r12b, %dl
-; CHECK-BASELINE-NEXT: xorb %r15b, %cl
-; CHECK-BASELINE-NEXT: xorb %r14b, %r8b
-; CHECK-BASELINE-NEXT: xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT: xorb %r14b, %cl
+; CHECK-BASELINE-NEXT: xorb %bpl, %r8b
+; CHECK-BASELINE-NEXT: xorb %bl, %r9b
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %bl
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT: xorb %r10b, %al
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b
@@ -2523,19 +2515,19 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT: xorb %r11b, %sil
; CHECK-BASELINE-NEXT: xorb %r12b, %dl
-; CHECK-BASELINE-NEXT: xorb %r15b, %cl
-; CHECK-BASELINE-NEXT: xorb %r14b, %r8b
-; CHECK-BASELINE-NEXT: xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT: xorb %r14b, %cl
+; CHECK-BASELINE-NEXT: xorb %bpl, %r8b
+; CHECK-BASELINE-NEXT: xorb %bl, %r9b
+; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %bl
; CHECK-BASELINE-NEXT: xorb %r10b, %al
; CHECK-BASELINE-NEXT: movb %al, 7(%rdi)
-; CHECK-BASELINE-NEXT: movb %bl, 6(%rdi)
-; CHECK-BASELINE-NEXT: movb %r13b, 5(%rdi)
+; CHECK-BASELINE-NEXT: movb %r13b, 6(%rdi)
+; CHECK-BASELINE-NEXT: movb %r15b, 5(%rdi)
; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi)
; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi)
; CHECK-BASELINE-NEXT: movb %cl, 2(%rdi)
@@ -2559,20 +2551,20 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT: xorb %r11b, %sil
; CHECK-SSE1-NEXT: xorb %r12b, %dl
-; CHECK-SSE1-NEXT: xorb %r15b, %cl
-; CHECK-SSE1-NEXT: xorb %r14b, %r8b
-; CHECK-SSE1-NEXT: xorb %bpl, %r9b
+; CHECK-SSE1-NEXT: xorb %r14b, %cl
+; CHECK-SSE1-NEXT: xorb %bpl, %r8b
+; CHECK-SSE1-NEXT: xorb %bl, %r9b
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %bl
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT: xorb %r10b, %al
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b
@@ -2581,19 +2573,19 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT: xorb %r11b, %sil
; CHECK-SSE1-NEXT: xorb %r12b, %dl
-; CHECK-SSE1-NEXT: xorb %r15b, %cl
-; CHECK-SSE1-NEXT: xorb %r14b, %r8b
-; CHECK-SSE1-NEXT: xorb %bpl, %r9b
+; CHECK-SSE1-NEXT: xorb %r14b, %cl
+; CHECK-SSE1-NEXT: xorb %bpl, %r8b
+; CHECK-SSE1-NEXT: xorb %bl, %r9b
+; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %bl
; CHECK-SSE1-NEXT: xorb %r10b, %al
; CHECK-SSE1-NEXT: movb %al, 7(%rdi)
-; CHECK-SSE1-NEXT: movb %bl, 6(%rdi)
-; CHECK-SSE1-NEXT: movb %r13b, 5(%rdi)
+; CHECK-SSE1-NEXT: movb %r13b, 6(%rdi)
+; CHECK-SSE1-NEXT: movb %r15b, 5(%rdi)
; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi)
; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi)
; CHECK-SSE1-NEXT: movb %cl, 2(%rdi)
@@ -2629,21 +2621,21 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind
; CHECK-BASELINE-LABEL: in_v4i16:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT: xorl %r9d, %esi
-; CHECK-BASELINE-NEXT: xorl %edi, %edx
-; CHECK-BASELINE-NEXT: xorl %r11d, %ecx
-; CHECK-BASELINE-NEXT: xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT: xorl %r11d, %edx
+; CHECK-BASELINE-NEXT: xorl %r10d, %ecx
+; CHECK-BASELINE-NEXT: xorl %edi, %r8d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT: xorl %r9d, %esi
-; CHECK-BASELINE-NEXT: xorl %edi, %edx
-; CHECK-BASELINE-NEXT: xorl %r11d, %ecx
-; CHECK-BASELINE-NEXT: xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT: xorl %r11d, %edx
+; CHECK-BASELINE-NEXT: xorl %r10d, %ecx
+; CHECK-BASELINE-NEXT: xorl %edi, %r8d
; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT: movw %cx, 4(%rax)
; CHECK-BASELINE-NEXT: movw %dx, 2(%rax)
@@ -2653,21 +2645,21 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind
; CHECK-SSE1-LABEL: in_v4i16:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT: xorl %r9d, %esi
-; CHECK-SSE1-NEXT: xorl %edi, %edx
-; CHECK-SSE1-NEXT: xorl %r11d, %ecx
-; CHECK-SSE1-NEXT: xorl %r10d, %r8d
+; CHECK-SSE1-NEXT: xorl %r11d, %edx
+; CHECK-SSE1-NEXT: xorl %r10d, %ecx
+; CHECK-SSE1-NEXT: xorl %edi, %r8d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT: xorl %r9d, %esi
-; CHECK-SSE1-NEXT: xorl %edi, %edx
-; CHECK-SSE1-NEXT: xorl %r11d, %ecx
-; CHECK-SSE1-NEXT: xorl %r10d, %r8d
+; CHECK-SSE1-NEXT: xorl %r11d, %edx
+; CHECK-SSE1-NEXT: xorl %r10d, %ecx
+; CHECK-SSE1-NEXT: xorl %edi, %r8d
; CHECK-SSE1-NEXT: movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT: movw %cx, 4(%rax)
; CHECK-SSE1-NEXT: movw %dx, 2(%rax)
@@ -2767,12 +2759,12 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
@@ -2788,13 +2780,9 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b
; CHECK-BASELINE-NEXT: xorb %r11b, %r10b
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT: xorb %bl, %r11b
+; CHECK-BASELINE-NEXT: xorb %r13b, %r11b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT: xorb %bl, %r11b
-; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT: xorb %r13b, %bl
-; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT: xorb %r13b, %bl
+; CHECK-BASELINE-NEXT: xorb %r13b, %r11b
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-BASELINE-NEXT: xorb %r12b, %r13b
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b
@@ -2812,9 +2800,13 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b
; CHECK-BASELINE-NEXT: xorb %bpl, %r14b
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT: xorb %al, %bpl
+; CHECK-BASELINE-NEXT: xorb %bl, %bpl
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT: xorb %al, %bpl
+; CHECK-BASELINE-NEXT: xorb %bl, %bpl
+; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: xorb %al, %bl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT: xorb %al, %bl
; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT: xorb %cl, %al
; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al
@@ -2825,12 +2817,12 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
; CHECK-BASELINE-NEXT: xorb %sil, %cl
; CHECK-BASELINE-NEXT: movb %cl, 15(%rdx)
; CHECK-BASELINE-NEXT: movb %al, 14(%rdx)
-; CHECK-BASELINE-NEXT: movb %bpl, 13(%rdx)
-; CHECK-BASELINE-NEXT: movb %r14b, 12(%rdx)
-; CHECK-BASELINE-NEXT: movb %r15b, 11(%rdx)
-; CHECK-BASELINE-NEXT: movb %r12b, 10(%rdx)
-; CHECK-BASELINE-NEXT: movb %r13b, 9(%rdx)
-; CHECK-BASELINE-NEXT: movb %bl, 8(%rdx)
+; CHECK-BASELINE-NEXT: movb %bl, 13(%rdx)
+; CHECK-BASELINE-NEXT: movb %bpl, 12(%rdx)
+; CHECK-BASELINE-NEXT: movb %r14b, 11(%rdx)
+; CHECK-BASELINE-NEXT: movb %r15b, 10(%rdx)
+; CHECK-BASELINE-NEXT: movb %r12b, 9(%rdx)
+; CHECK-BASELINE-NEXT: movb %r13b, 8(%rdx)
; CHECK-BASELINE-NEXT: movb %r11b, 7(%rdx)
; CHECK-BASELINE-NEXT: movb %r10b, 6(%rdx)
; CHECK-BASELINE-NEXT: movb %dil, 5(%rdx)
@@ -2882,12 +2874,12 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
@@ -2903,13 +2895,9 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b
; CHECK-SSE1-NEXT: xorb %r11b, %r10b
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT: xorb %bl, %r11b
+; CHECK-SSE1-NEXT: xorb %r13b, %r11b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT: xorb %bl, %r11b
-; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT: xorb %r13b, %bl
-; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT: xorb %r13b, %bl
+; CHECK-SSE1-NEXT: xorb %r13b, %r11b
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-SSE1-NEXT: xorb %r12b, %r13b
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b
@@ -2927,9 +2915,13 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b
; CHECK-SSE1-NEXT: xorb %bpl, %r14b
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT: xorb %al, %bpl
+; CHECK-SSE1-NEXT: xorb %bl, %bpl
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT: xorb %al, %bpl
+; CHECK-SSE1-NEXT: xorb %bl, %bpl
+; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: xorb %al, %bl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT: xorb %al, %bl
; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT: xorb %cl, %al
; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al
@@ -2940,12 +2932,12 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
; CHECK-SSE1-NEXT: xorb %sil, %cl
; CHECK-SSE1-NEXT: movb %cl, 15(%rdx)
; CHECK-SSE1-NEXT: movb %al, 14(%rdx)
-; CHECK-SSE1-NEXT: movb %bpl, 13(%rdx)
-; CHECK-SSE1-NEXT: movb %r14b, 12(%rdx)
-; CHECK-SSE1-NEXT: movb %r15b, 11(%rdx)
-; CHECK-SSE1-NEXT: movb %r12b, 10(%rdx)
-; CHECK-SSE1-NEXT: movb %r13b, 9(%rdx)
-; CHECK-SSE1-NEXT: movb %bl, 8(%rdx)
+; CHECK-SSE1-NEXT: movb %bl, 13(%rdx)
+; CHECK-SSE1-NEXT: movb %bpl, 12(%rdx)
+; CHECK-SSE1-NEXT: movb %r14b, 11(%rdx)
+; CHECK-SSE1-NEXT: movb %r15b, 10(%rdx)
+; CHECK-SSE1-NEXT: movb %r12b, 9(%rdx)
+; CHECK-SSE1-NEXT: movb %r13b, 8(%rdx)
; CHECK-SSE1-NEXT: movb %r11b, 7(%rdx)
; CHECK-SSE1-NEXT: movb %r10b, 6(%rdx)
; CHECK-SSE1-NEXT: movb %dil, 5(%rdx)
@@ -3002,12 +2994,11 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v8i16:
; CHECK-BASELINE: # %bb.0:
-; CHECK-BASELINE-NEXT: pushq %rbp
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT: xorl %ebx, %esi
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
@@ -3028,38 +3019,36 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind
; CHECK-BASELINE-NEXT: xorl %ebx, %r9d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w
; CHECK-BASELINE-NEXT: xorl %ebx, %r9d
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT: xorw %di, %bp
-; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bp
-; CHECK-BASELINE-NEXT: xorl %edi, %ebp
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT: xorw %r11w, %di
-; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di
-; CHECK-BASELINE-NEXT: xorl %r11d, %edi
; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT: xorw %r10w, %bx
+; CHECK-BASELINE-NEXT: xorw %r11w, %bx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx
-; CHECK-BASELINE-NEXT: xorl %r10d, %ebx
-; CHECK-BASELINE-NEXT: movw %bx, 14(%rax)
-; CHECK-BASELINE-NEXT: movw %di, 12(%rax)
-; CHECK-BASELINE-NEXT: movw %bp, 10(%rax)
+; CHECK-BASELINE-NEXT: xorl %r11d, %ebx
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT: xorw %r10w, %r11w
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT: xorl %r10d, %r11d
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: xorw %di, %r10w
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT: xorl %edi, %r10d
+; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax)
+; CHECK-BASELINE-NEXT: movw %r11w, 12(%rax)
+; CHECK-BASELINE-NEXT: movw %bx, 10(%rax)
; CHECK-BASELINE-NEXT: movw %r9w, 8(%rax)
; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT: movw %cx, 4(%rax)
; CHECK-BASELINE-NEXT: movw %dx, 2(%rax)
; CHECK-BASELINE-NEXT: movw %si, (%rax)
; CHECK-BASELINE-NEXT: popq %rbx
-; CHECK-BASELINE-NEXT: popq %rbp
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: in_v8i16:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: pushq %rbp
; CHECK-SSE1-NEXT: pushq %rbx
; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT: xorl %ebx, %esi
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
@@ -3080,28 +3069,27 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind
; CHECK-SSE1-NEXT: xorl %ebx, %r9d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w
; CHECK-SSE1-NEXT: xorl %ebx, %r9d
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT: xorw %di, %bp
-; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bp
-; CHECK-SSE1-NEXT: xorl %edi, %ebp
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT: xorw %r11w, %di
-; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di
-; CHECK-SSE1-NEXT: xorl %r11d, %edi
; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT: xorw %r10w, %bx
+; CHECK-SSE1-NEXT: xorw %r11w, %bx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx
-; CHECK-SSE1-NEXT: xorl %r10d, %ebx
-; CHECK-SSE1-NEXT: movw %bx, 14(%rax)
-; CHECK-SSE1-NEXT: movw %di, 12(%rax)
-; CHECK-SSE1-NEXT: movw %bp, 10(%rax)
+; CHECK-SSE1-NEXT: xorl %r11d, %ebx
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT: xorw %r10w, %r11w
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT: xorl %r10d, %r11d
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: xorw %di, %r10w
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT: xorl %edi, %r10d
+; CHECK-SSE1-NEXT: movw %r10w, 14(%rax)
+; CHECK-SSE1-NEXT: movw %r11w, 12(%rax)
+; CHECK-SSE1-NEXT: movw %bx, 10(%rax)
; CHECK-SSE1-NEXT: movw %r9w, 8(%rax)
; CHECK-SSE1-NEXT: movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT: movw %cx, 4(%rax)
; CHECK-SSE1-NEXT: movw %dx, 2(%rax)
; CHECK-SSE1-NEXT: movw %si, (%rax)
; CHECK-SSE1-NEXT: popq %rbx
-; CHECK-SSE1-NEXT: popq %rbp
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_v8i16:
@@ -3126,29 +3114,29 @@ define <4 x i32> @in_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movl 8(%rdx), %r9d
-; CHECK-BASELINE-NEXT: movl (%rdx), %r11d
+; CHECK-BASELINE-NEXT: movl 12(%rdx), %edi
+; CHECK-BASELINE-NEXT: movl 8(%rdx), %r8d
+; CHECK-BASELINE-NEXT: movl (%rdx), %r9d
; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d
; CHECK-BASELINE-NEXT: movl (%rsi), %edx
-; CHECK-BASELINE-NEXT: xorl %r11d, %edx
-; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi
-; CHECK-BASELINE-NEXT: xorl %r10d, %edi
+; CHECK-BASELINE-NEXT: xorl %r9d, %edx
+; CHECK-BASELINE-NEXT: movl 4(%rsi), %r11d
+; CHECK-BASELINE-NEXT: xorl %r10d, %r11d
; CHECK-BASELINE-NEXT: movl 8(%rsi), %ebx
-; CHECK-BASELINE-NEXT: xorl %r9d, %ebx
+; CHECK-BASELINE-NEXT: xorl %r8d, %ebx
; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi
-; CHECK-BASELINE-NEXT: xorl %r8d, %esi
+; CHECK-BASELINE-NEXT: xorl %edi, %esi
; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi
; CHECK-BASELINE-NEXT: andl 8(%rcx), %ebx
-; CHECK-BASELINE-NEXT: andl 4(%rcx), %edi
+; CHECK-BASELINE-NEXT: andl 4(%rcx), %r11d
; CHECK-BASELINE-NEXT: andl (%rcx), %edx
-; CHECK-BASELINE-NEXT: xorl %r11d, %edx
-; CHECK-BASELINE-NEXT: xorl %r10d, %edi
-; CHECK-BASELINE-NEXT: xorl %r9d, %ebx
-; CHECK-BASELINE-NEXT: xorl %r8d, %esi
+; CHECK-BASELINE-NEXT: xorl %r9d, %edx
+; CHECK-BASELINE-NEXT: xorl %r10d, %r11d
+; CHECK-BASELINE-NEXT: xorl %r8d, %ebx
+; CHECK-BASELINE-NEXT: xorl %edi, %esi
; CHECK-BASELINE-NEXT: movl %esi, 12(%rax)
; CHECK-BASELINE-NEXT: movl %ebx, 8(%rax)
-; CHECK-BASELINE-NEXT: movl %edi, 4(%rax)
+; CHECK-BASELINE-NEXT: movl %r11d, 4(%rax)
; CHECK-BASELINE-NEXT: movl %edx, (%rax)
; CHECK-BASELINE-NEXT: popq %rbx
; CHECK-BASELINE-NEXT: retq
@@ -3244,9 +3232,9 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movq %rdx, %r13
-; CHECK-BASELINE-NEXT: movq %rsi, %rbx
+; CHECK-BASELINE-NEXT: movq %rsi, %r12
; CHECK-BASELINE-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %r12d
+; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %r15d
; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 13(%rdx), %eax
@@ -3257,200 +3245,200 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r9d
-; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r10d
-; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r11d
-; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r8d
+; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d
+; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d
+; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d
+; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r11d
; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp
-; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi
-; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edi
-; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %r14d
-; CHECK-BASELINE-NEXT: movzbl (%rdx), %eax
-; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %r15d
-; CHECK-BASELINE-NEXT: movzbl (%rbx), %edx
-; CHECK-BASELINE-NEXT: xorb %al, %dl
-; CHECK-BASELINE-NEXT: andb (%rcx), %dl
-; CHECK-BASELINE-NEXT: xorb %al, %dl
-; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 1(%rbx), %eax
-; CHECK-BASELINE-NEXT: xorb %r15b, %al
+; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %edi
+; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %esi
+; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %edx
+; CHECK-BASELINE-NEXT: movzbl (%r13), %eax
+; CHECK-BASELINE-NEXT: movzbl 1(%r13), %ebx
+; CHECK-BASELINE-NEXT: movzbl (%r12), %r14d
+; CHECK-BASELINE-NEXT: xorb %al, %r14b
+; CHECK-BASELINE-NEXT: andb (%rcx), %r14b
+; CHECK-BASELINE-NEXT: xorb %al, %r14b
+; CHECK-BASELINE-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 1(%r12), %eax
+; CHECK-BASELINE-NEXT: xorb %bl, %al
; CHECK-BASELINE-NEXT: andb 1(%rcx), %al
-; CHECK-BASELINE-NEXT: xorb %r15b, %al
+; CHECK-BASELINE-NEXT: xorb %bl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 2(%rbx), %eax
-; CHECK-BASELINE-NEXT: xorb %r14b, %al
+; CHECK-BASELINE-NEXT: movzbl 2(%r12), %eax
+; CHECK-BASELINE-NEXT: xorb %dl, %al
; CHECK-BASELINE-NEXT: andb 2(%rcx), %al
-; CHECK-BASELINE-NEXT: xorb %r14b, %al
+; CHECK-BASELINE-NEXT: xorb %dl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 3(%rbx), %eax
-; CHECK-BASELINE-NEXT: xorb %dil, %al
+; CHECK-BASELINE-NEXT: movzbl 3(%r12), %eax
+; CHECK-BASELINE-NEXT: xorb %sil, %al
; CHECK-BASELINE-NEXT: andb 3(%rcx), %al
-; CHECK-BASELINE-NEXT: xorb %dil, %al
-; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 4(%rbx), %eax
; CHECK-BASELINE-NEXT: xorb %sil, %al
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 4(%r12), %eax
+; CHECK-BASELINE-NEXT: xorb %dil, %al
; CHECK-BASELINE-NEXT: andb 4(%rcx), %al
-; CHECK-BASELINE-NEXT: xorb %sil, %al
+; CHECK-BASELINE-NEXT: xorb %dil, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 5(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 5(%r12), %eax
; CHECK-BASELINE-NEXT: xorb %bpl, %al
; CHECK-BASELINE-NEXT: andb 5(%rcx), %al
; CHECK-BASELINE-NEXT: xorb %bpl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 6(%rbx), %eax
-; CHECK-BASELINE-NEXT: xorb %r8b, %al
-; CHECK-BASELINE-NEXT: andb 6(%rcx), %al
-; CHECK-BASELINE-NEXT: xorb %r8b, %al
-; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 7(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 6(%r12), %eax
; CHECK-BASELINE-NEXT: xorb %r11b, %al
-; CHECK-BASELINE-NEXT: andb 7(%rcx), %al
+; CHECK-BASELINE-NEXT: andb 6(%rcx), %al
; CHECK-BASELINE-NEXT: xorb %r11b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 8(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 7(%r12), %eax
; CHECK-BASELINE-NEXT: xorb %r10b, %al
-; CHECK-BASELINE-NEXT: andb 8(%rcx), %al
+; CHECK-BASELINE-NEXT: andb 7(%rcx), %al
; CHECK-BASELINE-NEXT: xorb %r10b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 9(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 8(%r12), %eax
; CHECK-BASELINE-NEXT: xorb %r9b, %al
-; CHECK-BASELINE-NEXT: andb 9(%rcx), %al
+; CHECK-BASELINE-NEXT: andb 8(%rcx), %al
; CHECK-BASELINE-NEXT: xorb %r9b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 10(%rbx), %edx
+; CHECK-BASELINE-NEXT: movzbl 9(%r12), %eax
+; CHECK-BASELINE-NEXT: xorb %r8b, %al
+; CHECK-BASELINE-NEXT: andb 9(%rcx), %al
+; CHECK-BASELINE-NEXT: xorb %r8b, %al
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 10(%r12), %edx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: andb 10(%rcx), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 11(%rbx), %edx
+; CHECK-BASELINE-NEXT: movzbl 11(%r12), %edx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: andb 11(%rcx), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 12(%rbx), %edx
+; CHECK-BASELINE-NEXT: movzbl 12(%r12), %edx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: andb 12(%rcx), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 13(%rbx), %edx
+; CHECK-BASELINE-NEXT: movzbl 13(%r12), %edx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: andb 13(%rcx), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 14(%rbx), %edx
+; CHECK-BASELINE-NEXT: movzbl 14(%r12), %edx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: andb 14(%rcx), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 15(%rbx), %eax
-; CHECK-BASELINE-NEXT: xorb %r12b, %al
+; CHECK-BASELINE-NEXT: movzbl 15(%r12), %eax
+; CHECK-BASELINE-NEXT: xorb %r15b, %al
; CHECK-BASELINE-NEXT: andb 15(%rcx), %al
-; CHECK-BASELINE-NEXT: xorb %r12b, %al
+; CHECK-BASELINE-NEXT: xorb %r15b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 16(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 16(%rbx), %edx
+; CHECK-BASELINE-NEXT: movzbl 16(%r12), %edx
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: andb 16(%rcx), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 17(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 17(%rbx), %edx
+; CHECK-BASELINE-NEXT: movzbl 17(%r12), %edx
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: andb 17(%rcx), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 18(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 18(%rbx), %edx
+; CHECK-BASELINE-NEXT: movzbl 18(%r12), %edx
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: andb 18(%rcx), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 19(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 19(%rbx), %r12d
-; CHECK-BASELINE-NEXT: xorb %al, %r12b
-; CHECK-BASELINE-NEXT: andb 19(%rcx), %r12b
-; CHECK-BASELINE-NEXT: xorb %al, %r12b
-; CHECK-BASELINE-NEXT: movzbl 20(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 20(%rbx), %r15d
+; CHECK-BASELINE-NEXT: movzbl 19(%r12), %r15d
; CHECK-BASELINE-NEXT: xorb %al, %r15b
-; CHECK-BASELINE-NEXT: andb 20(%rcx), %r15b
-; CHECK-BASELINE-NEXT: movq %rcx, %rsi
+; CHECK-BASELINE-NEXT: andb 19(%rcx), %r15b
+; CHECK-BASELINE-NEXT: movq %rcx, %rdx
; CHECK-BASELINE-NEXT: xorb %al, %r15b
-; CHECK-BASELINE-NEXT: movzbl 21(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 21(%rbx), %r14d
+; CHECK-BASELINE-NEXT: movzbl 20(%r13), %eax
+; CHECK-BASELINE-NEXT: movzbl 20(%r12), %r14d
; CHECK-BASELINE-NEXT: xorb %al, %r14b
-; CHECK-BASELINE-NEXT: andb 21(%rcx), %r14b
+; CHECK-BASELINE-NEXT: andb 20(%rcx), %r14b
; CHECK-BASELINE-NEXT: xorb %al, %r14b
-; CHECK-BASELINE-NEXT: movzbl 22(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 22(%rbx), %ebp
+; CHECK-BASELINE-NEXT: movzbl 21(%r13), %eax
+; CHECK-BASELINE-NEXT: movzbl 21(%r12), %ebp
; CHECK-BASELINE-NEXT: xorb %al, %bpl
-; CHECK-BASELINE-NEXT: andb 22(%rcx), %bpl
+; CHECK-BASELINE-NEXT: andb 21(%rcx), %bpl
; CHECK-BASELINE-NEXT: xorb %al, %bpl
+; CHECK-BASELINE-NEXT: movzbl 22(%r13), %eax
+; CHECK-BASELINE-NEXT: movzbl 22(%r12), %ebx
+; CHECK-BASELINE-NEXT: xorb %al, %bl
+; CHECK-BASELINE-NEXT: andb 22(%rcx), %bl
+; CHECK-BASELINE-NEXT: xorb %al, %bl
; CHECK-BASELINE-NEXT: movzbl 23(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 23(%rbx), %r11d
+; CHECK-BASELINE-NEXT: movzbl 23(%r12), %r11d
; CHECK-BASELINE-NEXT: xorb %al, %r11b
; CHECK-BASELINE-NEXT: andb 23(%rcx), %r11b
; CHECK-BASELINE-NEXT: xorb %al, %r11b
; CHECK-BASELINE-NEXT: movzbl 24(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 24(%rbx), %r10d
+; CHECK-BASELINE-NEXT: movzbl 24(%r12), %r10d
; CHECK-BASELINE-NEXT: xorb %al, %r10b
; CHECK-BASELINE-NEXT: andb 24(%rcx), %r10b
; CHECK-BASELINE-NEXT: xorb %al, %r10b
; CHECK-BASELINE-NEXT: movzbl 25(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 25(%rbx), %r9d
+; CHECK-BASELINE-NEXT: movzbl 25(%r12), %r9d
; CHECK-BASELINE-NEXT: xorb %al, %r9b
; CHECK-BASELINE-NEXT: andb 25(%rcx), %r9b
; CHECK-BASELINE-NEXT: xorb %al, %r9b
; CHECK-BASELINE-NEXT: movzbl 26(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 26(%rbx), %r8d
+; CHECK-BASELINE-NEXT: movzbl 26(%r12), %r8d
; CHECK-BASELINE-NEXT: xorb %al, %r8b
; CHECK-BASELINE-NEXT: andb 26(%rcx), %r8b
; CHECK-BASELINE-NEXT: xorb %al, %r8b
; CHECK-BASELINE-NEXT: movzbl 27(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 27(%rbx), %edi
+; CHECK-BASELINE-NEXT: movzbl 27(%r12), %edi
; CHECK-BASELINE-NEXT: xorb %al, %dil
; CHECK-BASELINE-NEXT: andb 27(%rcx), %dil
; CHECK-BASELINE-NEXT: xorb %al, %dil
; CHECK-BASELINE-NEXT: movzbl 28(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 28(%rbx), %edx
-; CHECK-BASELINE-NEXT: xorb %al, %dl
-; CHECK-BASELINE-NEXT: andb 28(%rcx), %dl
-; CHECK-BASELINE-NEXT: xorb %al, %dl
+; CHECK-BASELINE-NEXT: movzbl 28(%r12), %esi
+; CHECK-BASELINE-NEXT: xorb %al, %sil
+; CHECK-BASELINE-NEXT: andb 28(%rcx), %sil
+; CHECK-BASELINE-NEXT: xorb %al, %sil
; CHECK-BASELINE-NEXT: movzbl 29(%r13), %eax
-; CHECK-BASELINE-NEXT: movzbl 29(%rbx), %ecx
+; CHECK-BASELINE-NEXT: movzbl 29(%r12), %ecx
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 29(%rsi), %cl
+; CHECK-BASELINE-NEXT: andb 29(%rdx), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movzbl 30(%r13), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 30(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 30(%r12), %eax
; CHECK-BASELINE-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: andb 30(%rsi), %al
+; CHECK-BASELINE-NEXT: andb 30(%rdx), %al
; CHECK-BASELINE-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: movzbl 31(%r13), %r13d
-; CHECK-BASELINE-NEXT: movzbl 31(%rbx), %ebx
-; CHECK-BASELINE-NEXT: xorb %r13b, %bl
-; CHECK-BASELINE-NEXT: andb 31(%rsi), %bl
-; CHECK-BASELINE-NEXT: xorb %r13b, %bl
+; CHECK-BASELINE-NEXT: movzbl 31(%r12), %r12d
+; CHECK-BASELINE-NEXT: xorb %r13b, %r12b
+; CHECK-BASELINE-NEXT: andb 31(%rdx), %r12b
+; CHECK-BASELINE-NEXT: xorb %r13b, %r12b
; CHECK-BASELINE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; CHECK-BASELINE-NEXT: movb %bl, 31(%r13)
+; CHECK-BASELINE-NEXT: movb %r12b, 31(%r13)
; CHECK-BASELINE-NEXT: movb %al, 30(%r13)
; CHECK-BASELINE-NEXT: movb %cl, 29(%r13)
-; CHECK-BASELINE-NEXT: movb %dl, 28(%r13)
+; CHECK-BASELINE-NEXT: movb %sil, 28(%r13)
; CHECK-BASELINE-NEXT: movb %dil, 27(%r13)
; CHECK-BASELINE-NEXT: movb %r8b, 26(%r13)
; CHECK-BASELINE-NEXT: movb %r9b, 25(%r13)
; CHECK-BASELINE-NEXT: movb %r10b, 24(%r13)
; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13)
-; CHECK-BASELINE-NEXT: movb %bpl, 22(%r13)
-; CHECK-BASELINE-NEXT: movb %r14b, 21(%r13)
-; CHECK-BASELINE-NEXT: movb %r15b, 20(%r13)
-; CHECK-BASELINE-NEXT: movb %r12b, 19(%r13)
+; CHECK-BASELINE-NEXT: movb %bl, 22(%r13)
+; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13)
+; CHECK-BASELINE-NEXT: movb %r14b, 20(%r13)
+; CHECK-BASELINE-NEXT: movb %r15b, 19(%r13)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: movb %al, 18(%r13)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -3507,9 +3495,9 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
; CHECK-SSE1-NEXT: movq %rdx, %r13
-; CHECK-SSE1-NEXT: movq %rsi, %rbx
+; CHECK-SSE1-NEXT: movq %rsi, %r12
; CHECK-SSE1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-SSE1-NEXT: movzbl 15(%rdx), %r12d
+; CHECK-SSE1-NEXT: movzbl 15(%rdx), %r15d
; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 13(%rdx), %eax
@@ -3520,200 +3508,200 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r9d
-; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r10d
-; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r11d
-; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r8d
+; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d
+; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d
+; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d
+; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r11d
; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp
-; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi
-; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edi
-; CHECK-SSE1-NEXT: movzbl 2(%rdx), %r14d
-; CHECK-SSE1-NEXT: movzbl (%rdx), %eax
-; CHECK-SSE1-NEXT: movzbl 1(%rdx), %r15d
-; CHECK-SSE1-NEXT: movzbl (%rbx), %edx
-; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: andb (%rcx), %dl
-; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 1(%rbx), %eax
-; CHECK-SSE1-NEXT: xorb %r15b, %al
+; CHECK-SSE1-NEXT: movzbl 4(%rdx), %edi
+; CHECK-SSE1-NEXT: movzbl 3(%rdx), %esi
+; CHECK-SSE1-NEXT: movzbl 2(%rdx), %edx
+; CHECK-SSE1-NEXT: movzbl (%r13), %eax
+; CHECK-SSE1-NEXT: movzbl 1(%r13), %ebx
+; CHECK-SSE1-NEXT: movzbl (%r12), %r14d
+; CHECK-SSE1-NEXT: xorb %al, %r14b
+; CHECK-SSE1-NEXT: andb (%rcx), %r14b
+; CHECK-SSE1-NEXT: xorb %al, %r14b
+; CHECK-SSE1-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 1(%r12), %eax
+; CHECK-SSE1-NEXT: xorb %bl, %al
; CHECK-SSE1-NEXT: andb 1(%rcx), %al
-; CHECK-SSE1-NEXT: xorb %r15b, %al
+; CHECK-SSE1-NEXT: xorb %bl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 2(%rbx), %eax
-; CHECK-SSE1-NEXT: xorb %r14b, %al
+; CHECK-SSE1-NEXT: movzbl 2(%r12), %eax
+; CHECK-SSE1-NEXT: xorb %dl, %al
; CHECK-SSE1-NEXT: andb 2(%rcx), %al
-; CHECK-SSE1-NEXT: xorb %r14b, %al
+; CHECK-SSE1-NEXT: xorb %dl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 3(%rbx), %eax
-; CHECK-SSE1-NEXT: xorb %dil, %al
+; CHECK-SSE1-NEXT: movzbl 3(%r12), %eax
+; CHECK-SSE1-NEXT: xorb %sil, %al
; CHECK-SSE1-NEXT: andb 3(%rcx), %al
-; CHECK-SSE1-NEXT: xorb %dil, %al
-; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 4(%rbx), %eax
; CHECK-SSE1-NEXT: xorb %sil, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 4(%r12), %eax
+; CHECK-SSE1-NEXT: xorb %dil, %al
; CHECK-SSE1-NEXT: andb 4(%rcx), %al
-; CHECK-SSE1-NEXT: xorb %sil, %al
+; CHECK-SSE1-NEXT: xorb %dil, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 5(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 5(%r12), %eax
; CHECK-SSE1-NEXT: xorb %bpl, %al
; CHECK-SSE1-NEXT: andb 5(%rcx), %al
; CHECK-SSE1-NEXT: xorb %bpl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 6(%rbx), %eax
-; CHECK-SSE1-NEXT: xorb %r8b, %al
-; CHECK-SSE1-NEXT: andb 6(%rcx), %al
-; CHECK-SSE1-NEXT: xorb %r8b, %al
-; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 7(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 6(%r12), %eax
; CHECK-SSE1-NEXT: xorb %r11b, %al
-; CHECK-SSE1-NEXT: andb 7(%rcx), %al
+; CHECK-SSE1-NEXT: andb 6(%rcx), %al
; CHECK-SSE1-NEXT: xorb %r11b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 8(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 7(%r12), %eax
; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: andb 8(%rcx), %al
+; CHECK-SSE1-NEXT: andb 7(%rcx), %al
; CHECK-SSE1-NEXT: xorb %r10b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 9(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 8(%r12), %eax
; CHECK-SSE1-NEXT: xorb %r9b, %al
-; CHECK-SSE1-NEXT: andb 9(%rcx), %al
+; CHECK-SSE1-NEXT: andb 8(%rcx), %al
; CHECK-SSE1-NEXT: xorb %r9b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 10(%rbx), %edx
+; CHECK-SSE1-NEXT: movzbl 9(%r12), %eax
+; CHECK-SSE1-NEXT: xorb %r8b, %al
+; CHECK-SSE1-NEXT: andb 9(%rcx), %al
+; CHECK-SSE1-NEXT: xorb %r8b, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 10(%r12), %edx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: andb 10(%rcx), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 11(%rbx), %edx
+; CHECK-SSE1-NEXT: movzbl 11(%r12), %edx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: andb 11(%rcx), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 12(%rbx), %edx
+; CHECK-SSE1-NEXT: movzbl 12(%r12), %edx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: andb 12(%rcx), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 13(%rbx), %edx
+; CHECK-SSE1-NEXT: movzbl 13(%r12), %edx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: andb 13(%rcx), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 14(%rbx), %edx
+; CHECK-SSE1-NEXT: movzbl 14(%r12), %edx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: andb 14(%rcx), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 15(%rbx), %eax
-; CHECK-SSE1-NEXT: xorb %r12b, %al
+; CHECK-SSE1-NEXT: movzbl 15(%r12), %eax
+; CHECK-SSE1-NEXT: xorb %r15b, %al
; CHECK-SSE1-NEXT: andb 15(%rcx), %al
-; CHECK-SSE1-NEXT: xorb %r12b, %al
+; CHECK-SSE1-NEXT: xorb %r15b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 16(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 16(%rbx), %edx
+; CHECK-SSE1-NEXT: movzbl 16(%r12), %edx
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: andb 16(%rcx), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 17(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 17(%rbx), %edx
+; CHECK-SSE1-NEXT: movzbl 17(%r12), %edx
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: andb 17(%rcx), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 18(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 18(%rbx), %edx
+; CHECK-SSE1-NEXT: movzbl 18(%r12), %edx
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: andb 18(%rcx), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 19(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 19(%rbx), %r12d
-; CHECK-SSE1-NEXT: xorb %al, %r12b
-; CHECK-SSE1-NEXT: andb 19(%rcx), %r12b
-; CHECK-SSE1-NEXT: xorb %al, %r12b
-; CHECK-SSE1-NEXT: movzbl 20(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 20(%rbx), %r15d
+; CHECK-SSE1-NEXT: movzbl 19(%r12), %r15d
; CHECK-SSE1-NEXT: xorb %al, %r15b
-; CHECK-SSE1-NEXT: andb 20(%rcx), %r15b
-; CHECK-SSE1-NEXT: movq %rcx, %rsi
+; CHECK-SSE1-NEXT: andb 19(%rcx), %r15b
+; CHECK-SSE1-NEXT: movq %rcx, %rdx
; CHECK-SSE1-NEXT: xorb %al, %r15b
-; CHECK-SSE1-NEXT: movzbl 21(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 21(%rbx), %r14d
+; CHECK-SSE1-NEXT: movzbl 20(%r13), %eax
+; CHECK-SSE1-NEXT: movzbl 20(%r12), %r14d
; CHECK-SSE1-NEXT: xorb %al, %r14b
-; CHECK-SSE1-NEXT: andb 21(%rcx), %r14b
+; CHECK-SSE1-NEXT: andb 20(%rcx), %r14b
; CHECK-SSE1-NEXT: xorb %al, %r14b
-; CHECK-SSE1-NEXT: movzbl 22(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 22(%rbx), %ebp
+; CHECK-SSE1-NEXT: movzbl 21(%r13), %eax
+; CHECK-SSE1-NEXT: movzbl 21(%r12), %ebp
; CHECK-SSE1-NEXT: xorb %al, %bpl
-; CHECK-SSE1-NEXT: andb 22(%rcx), %bpl
+; CHECK-SSE1-NEXT: andb 21(%rcx), %bpl
; CHECK-SSE1-NEXT: xorb %al, %bpl
+; CHECK-SSE1-NEXT: movzbl 22(%r13), %eax
+; CHECK-SSE1-NEXT: movzbl 22(%r12), %ebx
+; CHECK-SSE1-NEXT: xorb %al, %bl
+; CHECK-SSE1-NEXT: andb 22(%rcx), %bl
+; CHECK-SSE1-NEXT: xorb %al, %bl
; CHECK-SSE1-NEXT: movzbl 23(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 23(%rbx), %r11d
+; CHECK-SSE1-NEXT: movzbl 23(%r12), %r11d
; CHECK-SSE1-NEXT: xorb %al, %r11b
; CHECK-SSE1-NEXT: andb 23(%rcx), %r11b
; CHECK-SSE1-NEXT: xorb %al, %r11b
; CHECK-SSE1-NEXT: movzbl 24(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 24(%rbx), %r10d
+; CHECK-SSE1-NEXT: movzbl 24(%r12), %r10d
; CHECK-SSE1-NEXT: xorb %al, %r10b
; CHECK-SSE1-NEXT: andb 24(%rcx), %r10b
; CHECK-SSE1-NEXT: xorb %al, %r10b
; CHECK-SSE1-NEXT: movzbl 25(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 25(%rbx), %r9d
+; CHECK-SSE1-NEXT: movzbl 25(%r12), %r9d
; CHECK-SSE1-NEXT: xorb %al, %r9b
; CHECK-SSE1-NEXT: andb 25(%rcx), %r9b
; CHECK-SSE1-NEXT: xorb %al, %r9b
; CHECK-SSE1-NEXT: movzbl 26(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 26(%rbx), %r8d
+; CHECK-SSE1-NEXT: movzbl 26(%r12), %r8d
; CHECK-SSE1-NEXT: xorb %al, %r8b
; CHECK-SSE1-NEXT: andb 26(%rcx), %r8b
; CHECK-SSE1-NEXT: xorb %al, %r8b
; CHECK-SSE1-NEXT: movzbl 27(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 27(%rbx), %edi
+; CHECK-SSE1-NEXT: movzbl 27(%r12), %edi
; CHECK-SSE1-NEXT: xorb %al, %dil
; CHECK-SSE1-NEXT: andb 27(%rcx), %dil
; CHECK-SSE1-NEXT: xorb %al, %dil
; CHECK-SSE1-NEXT: movzbl 28(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 28(%rbx), %edx
-; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: andb 28(%rcx), %dl
-; CHECK-SSE1-NEXT: xorb %al, %dl
+; CHECK-SSE1-NEXT: movzbl 28(%r12), %esi
+; CHECK-SSE1-NEXT: xorb %al, %sil
+; CHECK-SSE1-NEXT: andb 28(%rcx), %sil
+; CHECK-SSE1-NEXT: xorb %al, %sil
; CHECK-SSE1-NEXT: movzbl 29(%r13), %eax
-; CHECK-SSE1-NEXT: movzbl 29(%rbx), %ecx
+; CHECK-SSE1-NEXT: movzbl 29(%r12), %ecx
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 29(%rsi), %cl
+; CHECK-SSE1-NEXT: andb 29(%rdx), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movzbl 30(%r13), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 30(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 30(%r12), %eax
; CHECK-SSE1-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: andb 30(%rsi), %al
+; CHECK-SSE1-NEXT: andb 30(%rdx), %al
; CHECK-SSE1-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload
; CHECK-SSE1-NEXT: movzbl 31(%r13), %r13d
-; CHECK-SSE1-NEXT: movzbl 31(%rbx), %ebx
-; CHECK-SSE1-NEXT: xorb %r13b, %bl
-; CHECK-SSE1-NEXT: andb 31(%rsi), %bl
-; CHECK-SSE1-NEXT: xorb %r13b, %bl
+; CHECK-SSE1-NEXT: movzbl 31(%r12), %r12d
+; CHECK-SSE1-NEXT: xorb %r13b, %r12b
+; CHECK-SSE1-NEXT: andb 31(%rdx), %r12b
+; CHECK-SSE1-NEXT: xorb %r13b, %r12b
; CHECK-SSE1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; CHECK-SSE1-NEXT: movb %bl, 31(%r13)
+; CHECK-SSE1-NEXT: movb %r12b, 31(%r13)
; CHECK-SSE1-NEXT: movb %al, 30(%r13)
; CHECK-SSE1-NEXT: movb %cl, 29(%r13)
-; CHECK-SSE1-NEXT: movb %dl, 28(%r13)
+; CHECK-SSE1-NEXT: movb %sil, 28(%r13)
; CHECK-SSE1-NEXT: movb %dil, 27(%r13)
; CHECK-SSE1-NEXT: movb %r8b, 26(%r13)
; CHECK-SSE1-NEXT: movb %r9b, 25(%r13)
; CHECK-SSE1-NEXT: movb %r10b, 24(%r13)
; CHECK-SSE1-NEXT: movb %r11b, 23(%r13)
-; CHECK-SSE1-NEXT: movb %bpl, 22(%r13)
-; CHECK-SSE1-NEXT: movb %r14b, 21(%r13)
-; CHECK-SSE1-NEXT: movb %r15b, 20(%r13)
-; CHECK-SSE1-NEXT: movb %r12b, 19(%r13)
+; CHECK-SSE1-NEXT: movb %bl, 22(%r13)
+; CHECK-SSE1-NEXT: movb %bpl, 21(%r13)
+; CHECK-SSE1-NEXT: movb %r14b, 20(%r13)
+; CHECK-SSE1-NEXT: movb %r15b, 19(%r13)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: movb %al, 18(%r13)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -3811,22 +3799,22 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %eax
; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movl 20(%rdx), %r11d
+; CHECK-BASELINE-NEXT: movl 20(%rdx), %r8d
+; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r11d
; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r14d
+; CHECK-BASELINE-NEXT: movl 16(%rdx), %ebx
+; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp
+; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movl 12(%rdx), %r14d
; CHECK-BASELINE-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movl 16(%rdx), %r15d
+; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r15d
; CHECK-BASELINE-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %r12d
+; CHECK-BASELINE-NEXT: movl 8(%rdx), %r12d
; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movl 12(%rdx), %r13d
+; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r13d
; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx
-; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %ebp
-; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movl (%rdx), %ecx
; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movl 4(%rdx), %edi
@@ -3842,24 +3830,23 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax
; CHECK-BASELINE-NEXT: xorw %di, %ax
; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %edx
-; CHECK-BASELINE-NEXT: xorw %bp, %dx
-; CHECK-BASELINE-NEXT: movl %edx, %eax
-; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %ecx
-; CHECK-BASELINE-NEXT: xorw %bx, %cx
-; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %edx
-; CHECK-BASELINE-NEXT: xorw %r8w, %dx
-; CHECK-BASELINE-NEXT: movl %edx, %r8d
+; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %ecx
+; CHECK-BASELINE-NEXT: xorw %r13w, %cx
+; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax
+; CHECK-BASELINE-NEXT: xorw %r12w, %ax
+; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax
+; CHECK-BASELINE-NEXT: xorw %r15w, %ax
; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %edx
-; CHECK-BASELINE-NEXT: xorw %r13w, %dx
+; CHECK-BASELINE-NEXT: xorw %r14w, %dx
; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %r13d
-; CHECK-BASELINE-NEXT: xorw %r12w, %r13w
+; CHECK-BASELINE-NEXT: xorw %bp, %r13w
; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %r12d
-; CHECK-BASELINE-NEXT: xorw %r15w, %r12w
+; CHECK-BASELINE-NEXT: xorw %bx, %r12w
; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %r15d
-; CHECK-BASELINE-NEXT: xorw %r14w, %r15w
+; CHECK-BASELINE-NEXT: xorw %r11w, %r15w
; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %r14d
-; CHECK-BASELINE-NEXT: xorw %r11w, %r14w
+; CHECK-BASELINE-NEXT: xorw %r8w, %r14w
; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %ebp
; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %ebx
@@ -3881,12 +3868,12 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: andw 14(%r9), %r13w
; CHECK-BASELINE-NEXT: andw 12(%r9), %dx
; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: andw 10(%r9), %r8w
-; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movl %ecx, %edx
-; CHECK-BASELINE-NEXT: andw 8(%r9), %dx
-; CHECK-BASELINE-NEXT: andw 6(%r9), %ax
+; CHECK-BASELINE-NEXT: andw 10(%r9), %ax
; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-BASELINE-NEXT: andw 8(%r9), %dx
+; CHECK-BASELINE-NEXT: andw 6(%r9), %cx
+; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
; CHECK-BASELINE-NEXT: andw 4(%r9), %r8w
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
@@ -3962,22 +3949,22 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movzwl 22(%rdx), %eax
; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movl 20(%rdx), %r11d
+; CHECK-SSE1-NEXT: movl 20(%rdx), %r8d
+; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r11d
; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r14d
+; CHECK-SSE1-NEXT: movl 16(%rdx), %ebx
+; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp
+; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movl 12(%rdx), %r14d
; CHECK-SSE1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movl 16(%rdx), %r15d
+; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r15d
; CHECK-SSE1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 14(%rdx), %r12d
+; CHECK-SSE1-NEXT: movl 8(%rdx), %r12d
; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movl 12(%rdx), %r13d
+; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r13d
; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r8d
-; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx
-; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 6(%rdx), %ebp
-; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movl (%rdx), %ecx
; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movl 4(%rdx), %edi
@@ -3993,24 +3980,23 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax
; CHECK-SSE1-NEXT: xorw %di, %ax
; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 6(%rsi), %edx
-; CHECK-SSE1-NEXT: xorw %bp, %dx
-; CHECK-SSE1-NEXT: movl %edx, %eax
-; CHECK-SSE1-NEXT: movzwl 8(%rsi), %ecx
-; CHECK-SSE1-NEXT: xorw %bx, %cx
-; CHECK-SSE1-NEXT: movzwl 10(%rsi), %edx
-; CHECK-SSE1-NEXT: xorw %r8w, %dx
-; CHECK-SSE1-NEXT: movl %edx, %r8d
+; CHECK-SSE1-NEXT: movzwl 6(%rsi), %ecx
+; CHECK-SSE1-NEXT: xorw %r13w, %cx
+; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax
+; CHECK-SSE1-NEXT: xorw %r12w, %ax
+; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax
+; CHECK-SSE1-NEXT: xorw %r15w, %ax
; CHECK-SSE1-NEXT: movzwl 12(%rsi), %edx
-; CHECK-SSE1-NEXT: xorw %r13w, %dx
+; CHECK-SSE1-NEXT: xorw %r14w, %dx
; CHECK-SSE1-NEXT: movzwl 14(%rsi), %r13d
-; CHECK-SSE1-NEXT: xorw %r12w, %r13w
+; CHECK-SSE1-NEXT: xorw %bp, %r13w
; CHECK-SSE1-NEXT: movzwl 16(%rsi), %r12d
-; CHECK-SSE1-NEXT: xorw %r15w, %r12w
+; CHECK-SSE1-NEXT: xorw %bx, %r12w
; CHECK-SSE1-NEXT: movzwl 18(%rsi), %r15d
-; CHECK-SSE1-NEXT: xorw %r14w, %r15w
+; CHECK-SSE1-NEXT: xorw %r11w, %r15w
; CHECK-SSE1-NEXT: movzwl 20(%rsi), %r14d
-; CHECK-SSE1-NEXT: xorw %r11w, %r14w
+; CHECK-SSE1-NEXT: xorw %r8w, %r14w
; CHECK-SSE1-NEXT: movzwl 22(%rsi), %ebp
; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
; CHECK-SSE1-NEXT: movzwl 24(%rsi), %ebx
@@ -4032,12 +4018,12 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: andw 14(%r9), %r13w
; CHECK-SSE1-NEXT: andw 12(%r9), %dx
; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: andw 10(%r9), %r8w
-; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movl %ecx, %edx
-; CHECK-SSE1-NEXT: andw 8(%r9), %dx
-; CHECK-SSE1-NEXT: andw 6(%r9), %ax
+; CHECK-SSE1-NEXT: andw 10(%r9), %ax
; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-SSE1-NEXT: andw 8(%r9), %dx
+; CHECK-SSE1-NEXT: andw 6(%r9), %cx
+; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
; CHECK-SSE1-NEXT: andw 4(%r9), %r8w
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
@@ -4131,57 +4117,57 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: pushq %r13
; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
-; CHECK-BASELINE-NEXT: movl 28(%rdx), %r15d
-; CHECK-BASELINE-NEXT: movl 24(%rdx), %r14d
+; CHECK-BASELINE-NEXT: movl 28(%rdx), %ebp
+; CHECK-BASELINE-NEXT: movl 24(%rdx), %ebx
; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d
; CHECK-BASELINE-NEXT: movl 16(%rdx), %eax
; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movl 12(%rdx), %ebp
-; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx
-; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movl (%rdx), %r12d
+; CHECK-BASELINE-NEXT: movl 12(%rdx), %r12d
+; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movl 8(%rdx), %r14d
+; CHECK-BASELINE-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movl (%rdx), %r15d
; CHECK-BASELINE-NEXT: movl 4(%rdx), %r13d
-; CHECK-BASELINE-NEXT: movl (%rsi), %r11d
-; CHECK-BASELINE-NEXT: xorl %r12d, %r11d
+; CHECK-BASELINE-NEXT: movl (%rsi), %r8d
+; CHECK-BASELINE-NEXT: xorl %r15d, %r8d
; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d
; CHECK-BASELINE-NEXT: xorl %r13d, %r9d
-; CHECK-BASELINE-NEXT: movl 8(%rsi), %r8d
-; CHECK-BASELINE-NEXT: xorl %ebx, %r8d
-; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebx
-; CHECK-BASELINE-NEXT: xorl %ebp, %ebx
-; CHECK-BASELINE-NEXT: movl 16(%rsi), %ebp
-; CHECK-BASELINE-NEXT: xorl %eax, %ebp
+; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d
+; CHECK-BASELINE-NEXT: xorl %r14d, %r11d
+; CHECK-BASELINE-NEXT: movl 12(%rsi), %r14d
+; CHECK-BASELINE-NEXT: xorl %r12d, %r14d
+; CHECK-BASELINE-NEXT: movl 16(%rsi), %r12d
+; CHECK-BASELINE-NEXT: xorl %eax, %r12d
; CHECK-BASELINE-NEXT: movl 20(%rsi), %edx
; CHECK-BASELINE-NEXT: xorl %r10d, %edx
; CHECK-BASELINE-NEXT: movl 24(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorl %r14d, %eax
+; CHECK-BASELINE-NEXT: xorl %ebx, %eax
; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi
-; CHECK-BASELINE-NEXT: xorl %r15d, %esi
+; CHECK-BASELINE-NEXT: xorl %ebp, %esi
; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi
; CHECK-BASELINE-NEXT: andl 24(%rcx), %eax
; CHECK-BASELINE-NEXT: andl 20(%rcx), %edx
-; CHECK-BASELINE-NEXT: andl 16(%rcx), %ebp
-; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebx
-; CHECK-BASELINE-NEXT: andl 8(%rcx), %r8d
+; CHECK-BASELINE-NEXT: andl 16(%rcx), %r12d
+; CHECK-BASELINE-NEXT: andl 12(%rcx), %r14d
+; CHECK-BASELINE-NEXT: andl 8(%rcx), %r11d
; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d
-; CHECK-BASELINE-NEXT: andl (%rcx), %r11d
-; CHECK-BASELINE-NEXT: xorl %r12d, %r11d
+; CHECK-BASELINE-NEXT: andl (%rcx), %r8d
+; CHECK-BASELINE-NEXT: xorl %r15d, %r8d
; CHECK-BASELINE-NEXT: xorl %r13d, %r9d
-; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT: xorl %r10d, %edx
-; CHECK-BASELINE-NEXT: xorl %r14d, %eax
-; CHECK-BASELINE-NEXT: xorl %r15d, %esi
+; CHECK-BASELINE-NEXT: xorl %ebx, %eax
+; CHECK-BASELINE-NEXT: xorl %ebp, %esi
; CHECK-BASELINE-NEXT: movl %esi, 28(%rdi)
; CHECK-BASELINE-NEXT: movl %eax, 24(%rdi)
; CHECK-BASELINE-NEXT: movl %edx, 20(%rdi)
-; CHECK-BASELINE-NEXT: movl %ebp, 16(%rdi)
-; CHECK-BASELINE-NEXT: movl %ebx, 12(%rdi)
-; CHECK-BASELINE-NEXT: movl %r8d, 8(%rdi)
+; CHECK-BASELINE-NEXT: movl %r12d, 16(%rdi)
+; CHECK-BASELINE-NEXT: movl %r14d, 12(%rdi)
+; CHECK-BASELINE-NEXT: movl %r11d, 8(%rdi)
; CHECK-BASELINE-NEXT: movl %r9d, 4(%rdi)
-; CHECK-BASELINE-NEXT: movl %r11d, (%rdi)
+; CHECK-BASELINE-NEXT: movl %r8d, (%rdi)
; CHECK-BASELINE-NEXT: movq %rdi, %rax
; CHECK-BASELINE-NEXT: popq %rbx
; CHECK-BASELINE-NEXT: popq %r12
@@ -4199,57 +4185,57 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: pushq %r13
; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
-; CHECK-SSE1-NEXT: movl 28(%rdx), %r15d
-; CHECK-SSE1-NEXT: movl 24(%rdx), %r14d
+; CHECK-SSE1-NEXT: movl 28(%rdx), %ebp
+; CHECK-SSE1-NEXT: movl 24(%rdx), %ebx
; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d
; CHECK-SSE1-NEXT: movl 16(%rdx), %eax
; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movl 12(%rdx), %ebp
-; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx
-; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movl (%rdx), %r12d
+; CHECK-SSE1-NEXT: movl 12(%rdx), %r12d
+; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movl 8(%rdx), %r14d
+; CHECK-SSE1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movl (%rdx), %r15d
; CHECK-SSE1-NEXT: movl 4(%rdx), %r13d
-; CHECK-SSE1-NEXT: movl (%rsi), %r11d
-; CHECK-SSE1-NEXT: xorl %r12d, %r11d
+; CHECK-SSE1-NEXT: movl (%rsi), %r8d
+; CHECK-SSE1-NEXT: xorl %r15d, %r8d
; CHECK-SSE1-NEXT: movl 4(%rsi), %r9d
; CHECK-SSE1-NEXT: xorl %r13d, %r9d
-; CHECK-SSE1-NEXT: movl 8(%rsi), %r8d
-; CHECK-SSE1-NEXT: xorl %ebx, %r8d
-; CHECK-SSE1-NEXT: movl 12(%rsi), %ebx
-; CHECK-SSE1-NEXT: xorl %ebp, %ebx
-; CHECK-SSE1-NEXT: movl 16(%rsi), %ebp
-; CHECK-SSE1-NEXT: xorl %eax, %ebp
+; CHECK-SSE1-NEXT: movl 8(%rsi), %r11d
+; CHECK-SSE1-NEXT: xorl %r14d, %r11d
+; CHECK-SSE1-NEXT: movl 12(%rsi), %r14d
+; CHECK-SSE1-NEXT: xorl %r12d, %r14d
+; CHECK-SSE1-NEXT: movl 16(%rsi), %r12d
+; CHECK-SSE1-NEXT: xorl %eax, %r12d
; CHECK-SSE1-NEXT: movl 20(%rsi), %edx
; CHECK-SSE1-NEXT: xorl %r10d, %edx
; CHECK-SSE1-NEXT: movl 24(%rsi), %eax
-; CHECK-SSE1-NEXT: xorl %r14d, %eax
+; CHECK-SSE1-NEXT: xorl %ebx, %eax
; CHECK-SSE1-NEXT: movl 28(%rsi), %esi
-; CHECK-SSE1-NEXT: xorl %r15d, %esi
+; CHECK-SSE1-NEXT: xorl %ebp, %esi
; CHECK-SSE1-NEXT: andl 28(%rcx), %esi
; CHECK-SSE1-NEXT: andl 24(%rcx), %eax
; CHECK-SSE1-NEXT: andl 20(%rcx), %edx
-; CHECK-SSE1-NEXT: andl 16(%rcx), %ebp
-; CHECK-SSE1-NEXT: andl 12(%rcx), %ebx
-; CHECK-SSE1-NEXT: andl 8(%rcx), %r8d
+; CHECK-SSE1-NEXT: andl 16(%rcx), %r12d
+; CHECK-SSE1-NEXT: andl 12(%rcx), %r14d
+; CHECK-SSE1-NEXT: andl 8(%rcx), %r11d
; CHECK-SSE1-NEXT: andl 4(%rcx), %r9d
-; CHECK-SSE1-NEXT: andl (%rcx), %r11d
-; CHECK-SSE1-NEXT: xorl %r12d, %r11d
+; CHECK-SSE1-NEXT: andl (%rcx), %r8d
+; CHECK-SSE1-NEXT: xorl %r15d, %r8d
; CHECK-SSE1-NEXT: xorl %r13d, %r9d
-; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
-; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
+; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
; CHECK-SSE1-NEXT: xorl %r10d, %edx
-; CHECK-SSE1-NEXT: xorl %r14d, %eax
-; CHECK-SSE1-NEXT: xorl %r15d, %esi
+; CHECK-SSE1-NEXT: xorl %ebx, %eax
+; CHECK-SSE1-NEXT: xorl %ebp, %esi
; CHECK-SSE1-NEXT: movl %esi, 28(%rdi)
; CHECK-SSE1-NEXT: movl %eax, 24(%rdi)
; CHECK-SSE1-NEXT: movl %edx, 20(%rdi)
-; CHECK-SSE1-NEXT: movl %ebp, 16(%rdi)
-; CHECK-SSE1-NEXT: movl %ebx, 12(%rdi)
-; CHECK-SSE1-NEXT: movl %r8d, 8(%rdi)
+; CHECK-SSE1-NEXT: movl %r12d, 16(%rdi)
+; CHECK-SSE1-NEXT: movl %r14d, 12(%rdi)
+; CHECK-SSE1-NEXT: movl %r11d, 8(%rdi)
; CHECK-SSE1-NEXT: movl %r9d, 4(%rdi)
-; CHECK-SSE1-NEXT: movl %r11d, (%rdi)
+; CHECK-SSE1-NEXT: movl %r8d, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
; CHECK-SSE1-NEXT: popq %rbx
; CHECK-SSE1-NEXT: popq %r12
@@ -4293,29 +4279,29 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movq 24(%rdx), %r8
-; CHECK-BASELINE-NEXT: movq 16(%rdx), %r9
-; CHECK-BASELINE-NEXT: movq (%rdx), %r11
+; CHECK-BASELINE-NEXT: movq 24(%rdx), %rdi
+; CHECK-BASELINE-NEXT: movq 16(%rdx), %r8
+; CHECK-BASELINE-NEXT: movq (%rdx), %r9
; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10
; CHECK-BASELINE-NEXT: movq (%rsi), %rdx
-; CHECK-BASELINE-NEXT: xorq %r11, %rdx
-; CHECK-BASELINE-NEXT: movq 8(%rsi), %rdi
-; CHECK-BASELINE-NEXT: xorq %r10, %rdi
+; CHECK-BASELINE-NEXT: xorq %r9, %rdx
+; CHECK-BASELINE-NEXT: movq 8(%rsi), %r11
+; CHECK-BASELINE-NEXT: xorq %r10, %r11
; CHECK-BASELINE-NEXT: movq 16(%rsi), %rbx
-; CHECK-BASELINE-NEXT: xorq %r9, %rbx
+; CHECK-BASELINE-NEXT: xorq %r8, %rbx
; CHECK-BASELINE-NEXT: movq 24(%rsi), %rsi
-; CHECK-BASELINE-NEXT: xorq %r8, %rsi
+; CHECK-BASELINE-NEXT: xorq %rdi, %rsi
; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi
; CHECK-BASELINE-NEXT: andq 16(%rcx), %rbx
-; CHECK-BASELINE-NEXT: andq 8(%rcx), %rdi
+; CHECK-BASELINE-NEXT: andq 8(%rcx), %r11
; CHECK-BASELINE-NEXT: andq (%rcx), %rdx
-; CHECK-BASELINE-NEXT: xorq %r11, %rdx
-; CHECK-BASELINE-NEXT: xorq %r10, %rdi
-; CHECK-BASELINE-NEXT: xorq %r9, %rbx
-; CHECK-BASELINE-NEXT: xorq %r8, %rsi
+; CHECK-BASELINE-NEXT: xorq %r9, %rdx
+; CHECK-BASELINE-NEXT: xorq %r10, %r11
+; CHECK-BASELINE-NEXT: xorq %r8, %rbx
+; CHECK-BASELINE-NEXT: xorq %rdi, %rsi
; CHECK-BASELINE-NEXT: movq %rsi, 24(%rax)
; CHECK-BASELINE-NEXT: movq %rbx, 16(%rax)
-; CHECK-BASELINE-NEXT: movq %rdi, 8(%rax)
+; CHECK-BASELINE-NEXT: movq %r11, 8(%rax)
; CHECK-BASELINE-NEXT: movq %rdx, (%rax)
; CHECK-BASELINE-NEXT: popq %rbx
; CHECK-BASELINE-NEXT: retq
@@ -4324,29 +4310,29 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: pushq %rbx
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movq 24(%rdx), %r8
-; CHECK-SSE1-NEXT: movq 16(%rdx), %r9
-; CHECK-SSE1-NEXT: movq (%rdx), %r11
+; CHECK-SSE1-NEXT: movq 24(%rdx), %rdi
+; CHECK-SSE1-NEXT: movq 16(%rdx), %r8
+; CHECK-SSE1-NEXT: movq (%rdx), %r9
; CHECK-SSE1-NEXT: movq 8(%rdx), %r10
; CHECK-SSE1-NEXT: movq (%rsi), %rdx
-; CHECK-SSE1-NEXT: xorq %r11, %rdx
-; CHECK-SSE1-NEXT: movq 8(%rsi), %rdi
-; CHECK-SSE1-NEXT: xorq %r10, %rdi
+; CHECK-SSE1-NEXT: xorq %r9, %rdx
+; CHECK-SSE1-NEXT: movq 8(%rsi), %r11
+; CHECK-SSE1-NEXT: xorq %r10, %r11
; CHECK-SSE1-NEXT: movq 16(%rsi), %rbx
-; CHECK-SSE1-NEXT: xorq %r9, %rbx
+; CHECK-SSE1-NEXT: xorq %r8, %rbx
; CHECK-SSE1-NEXT: movq 24(%rsi), %rsi
-; CHECK-SSE1-NEXT: xorq %r8, %rsi
+; CHECK-SSE1-NEXT: xorq %rdi, %rsi
; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi
; CHECK-SSE1-NEXT: andq 16(%rcx), %rbx
-; CHECK-SSE1-NEXT: andq 8(%rcx), %rdi
+; CHECK-SSE1-NEXT: andq 8(%rcx), %r11
; CHECK-SSE1-NEXT: andq (%rcx), %rdx
-; CHECK-SSE1-NEXT: xorq %r11, %rdx
-; CHECK-SSE1-NEXT: xorq %r10, %rdi
-; CHECK-SSE1-NEXT: xorq %r9, %rbx
-; CHECK-SSE1-NEXT: xorq %r8, %rsi
+; CHECK-SSE1-NEXT: xorq %r9, %rdx
+; CHECK-SSE1-NEXT: xorq %r10, %r11
+; CHECK-SSE1-NEXT: xorq %r8, %rbx
+; CHECK-SSE1-NEXT: xorq %rdi, %rsi
; CHECK-SSE1-NEXT: movq %rsi, 24(%rax)
; CHECK-SSE1-NEXT: movq %rbx, 16(%rax)
-; CHECK-SSE1-NEXT: movq %rdi, 8(%rax)
+; CHECK-SSE1-NEXT: movq %r11, 8(%rax)
; CHECK-SSE1-NEXT: movq %rdx, (%rax)
; CHECK-SSE1-NEXT: popq %rbx
; CHECK-SSE1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
index 6a49f74f1ddbd..5cdc516cb4337 100644
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -932,24 +932,24 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE-NEXT: pcmpgtd %xmm4, %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm4, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE-NEXT: pand %xmm11, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE-NEXT: por %xmm9, %xmm4
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE-NEXT: pand %xmm11, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE-NEXT: por %xmm4, %xmm9
+; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psubq %xmm5, %xmm1
; SSE-NEXT: pxor %xmm8, %xmm5
-; SSE-NEXT: pxor %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm9, %xmm4
-; SSE-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm5, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSE-NEXT: pand %xmm10, %xmm5
+; SSE-NEXT: pxor %xmm8, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm5, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm4
-; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm10, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psubq %xmm6, %xmm2
; SSE-NEXT: pxor %xmm8, %xmm6
@@ -982,10 +982,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm8
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpsubq %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 3f1c13e08b7fa..eebb2c6f95368 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -129,42 +129,42 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v8i16:
; SSE3: # %bb.0:
-; SSE3-NEXT: movd %xmm1, %r8d
-; SSE3-NEXT: pextrw $1, %xmm1, %r9d
-; SSE3-NEXT: pextrw $2, %xmm1, %r10d
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pextrw $1, %xmm1, %ecx
+; SSE3-NEXT: pextrw $2, %xmm1, %edx
; SSE3-NEXT: pextrw $3, %xmm1, %esi
; SSE3-NEXT: pextrw $4, %xmm1, %edi
-; SSE3-NEXT: pextrw $5, %xmm1, %eax
-; SSE3-NEXT: pextrw $6, %xmm1, %ecx
-; SSE3-NEXT: pextrw $7, %xmm1, %edx
+; SSE3-NEXT: pextrw $5, %xmm1, %r8d
+; SSE3-NEXT: pextrw $6, %xmm1, %r9d
+; SSE3-NEXT: pextrw $7, %xmm1, %r10d
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: andl $7, %r8d
-; SSE3-NEXT: andl $7, %r9d
-; SSE3-NEXT: andl $7, %r10d
-; SSE3-NEXT: andl $7, %esi
-; SSE3-NEXT: andl $7, %edi
; SSE3-NEXT: andl $7, %eax
; SSE3-NEXT: andl $7, %ecx
; SSE3-NEXT: andl $7, %edx
-; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
-; SSE3-NEXT: movd %edx, %xmm0
-; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
-; SSE3-NEXT: movd %ecx, %xmm1
+; SSE3-NEXT: andl $7, %esi
+; SSE3-NEXT: andl $7, %edi
+; SSE3-NEXT: andl $7, %r8d
+; SSE3-NEXT: andl $7, %r9d
+; SSE3-NEXT: andl $7, %r10d
+; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d
+; SSE3-NEXT: movd %r10d, %xmm0
+; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d
+; SSE3-NEXT: movd %r9d, %xmm1
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; SSE3-NEXT: movd %eax, %xmm0
-; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d
+; SSE3-NEXT: movd %r8d, %xmm0
+; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSE3-NEXT: movd %edi, %xmm2
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
-; SSE3-NEXT: movd %eax, %xmm0
-; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
-; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSE3-NEXT: movd %esi, %xmm0
+; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
-; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSE3-NEXT: movd %ecx, %xmm3
+; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -231,15 +231,15 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm8
+; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm15
+; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm9
+; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -247,7 +247,7 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm10
+; SSE3-NEXT: movd %eax, %xmm5
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -255,7 +255,7 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm11
+; SSE3-NEXT: movd %eax, %xmm8
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -263,49 +263,49 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm12
+; SSE3-NEXT: movd %eax, %xmm9
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm5
+; SSE3-NEXT: movd %eax, %xmm10
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm13
+; SSE3-NEXT: movd %eax, %xmm11
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm4
+; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm14
+; SSE3-NEXT: movd %eax, %xmm13
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: movd %eax, %xmm14
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: movd %eax, %xmm15
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT: retq
;
@@ -495,15 +495,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm8
+; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm15
+; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm9
+; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -511,7 +511,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm10
+; SSE3-NEXT: movd %eax, %xmm5
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -519,7 +519,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm11
+; SSE3-NEXT: movd %eax, %xmm8
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -527,49 +527,49 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm12
+; SSE3-NEXT: movd %eax, %xmm9
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm5
+; SSE3-NEXT: movd %eax, %xmm10
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm13
+; SSE3-NEXT: movd %eax, %xmm11
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm4
+; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm14
+; SSE3-NEXT: movd %eax, %xmm13
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: movd %eax, %xmm14
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: movd %eax, %xmm15
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT: retq
;
@@ -656,114 +656,112 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, (%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSE3-NEXT: andl $31, %r8d
-; SSE3-NEXT: movzbl -96(%rsp,%r8), %esi
-; SSE3-NEXT: movd %esi, %xmm8
-; SSE3-NEXT: andl $31, %ebp
-; SSE3-NEXT: movzbl -64(%rsp,%rbp), %esi
-; SSE3-NEXT: movd %esi, %xmm15
-; SSE3-NEXT: andl $31, %edx
-; SSE3-NEXT: movzbl -32(%rsp,%rdx), %edx
-; SSE3-NEXT: movd %edx, %xmm9
-; SSE3-NEXT: andl $31, %ecx
-; SSE3-NEXT: movzbl (%rsp,%rcx), %ecx
-; SSE3-NEXT: movd %ecx, %xmm3
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 32(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: andl $31, %edi
-; SSE3-NEXT: movzbl 64(%rsp,%rdi), %eax
+; SSE3-NEXT: movzbl -96(%rsp,%rax), %eax
+; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: andl $31, %ebp
+; SSE3-NEXT: movzbl -64(%rsp,%rbp), %eax
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: andl $31, %r13d
+; SSE3-NEXT: movzbl -32(%rsp,%r13), %eax
+; SSE3-NEXT: movd %eax, %xmm4
+; SSE3-NEXT: andl $31, %r12d
+; SSE3-NEXT: movzbl (%rsp,%r12), %eax
+; SSE3-NEXT: movd %eax, %xmm3
+; SSE3-NEXT: andl $31, %r15d
+; SSE3-NEXT: movzbl 32(%rsp,%r15), %eax
+; SSE3-NEXT: movd %eax, %xmm5
+; SSE3-NEXT: andl $31, %r14d
+; SSE3-NEXT: movzbl 64(%rsp,%r14), %eax
; SSE3-NEXT: movd %eax, %xmm7
; SSE3-NEXT: andl $31, %ebx
; SSE3-NEXT: movzbl 96(%rsp,%rbx), %eax
-; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: andl $31, %r9d
-; SSE3-NEXT: movzbl 128(%rsp,%r9), %eax
+; SSE3-NEXT: movd %eax, %xmm8
+; SSE3-NEXT: andl $31, %r11d
+; SSE3-NEXT: movzbl 128(%rsp,%r11), %eax
; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: andl $31, %r13d
-; SSE3-NEXT: movzbl 160(%rsp,%r13), %eax
+; SSE3-NEXT: andl $31, %r10d
+; SSE3-NEXT: movzbl 160(%rsp,%r10), %eax
+; SSE3-NEXT: movd %eax, %xmm9
+; SSE3-NEXT: andl $31, %r9d
+; SSE3-NEXT: movzbl 192(%rsp,%r9), %eax
+; SSE3-NEXT: movd %eax, %xmm10
+; SSE3-NEXT: andl $31, %r8d
+; SSE3-NEXT: movzbl 224(%rsp,%r8), %eax
+; SSE3-NEXT: movd %eax, %xmm11
+; SSE3-NEXT: andl $31, %edi
+; SSE3-NEXT: movzbl 256(%rsp,%rdi), %eax
; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: andl $31, %r12d
-; SSE3-NEXT: movzbl 192(%rsp,%r12), %eax
-; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: andl $31, %r15d
-; SSE3-NEXT: movzbl 224(%rsp,%r15), %eax
+; SSE3-NEXT: andl $31, %esi
+; SSE3-NEXT: movzbl 288(%rsp,%rsi), %eax
; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: andl $31, %r14d
-; SSE3-NEXT: movzbl 256(%rsp,%r14), %eax
-; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: andl $31, %r11d
-; SSE3-NEXT: movzbl 288(%rsp,%r11), %eax
+; SSE3-NEXT: andl $31, %edx
+; SSE3-NEXT: movzbl 320(%rsp,%rdx), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: andl $31, %r10d
-; SSE3-NEXT: movzbl 320(%rsp,%r10), %eax
-; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 352(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: andl $31, %ecx
+; SSE3-NEXT: movzbl 352(%rsp,%rcx), %eax
+; SSE3-NEXT: movd %eax, %xmm15
; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
; SSE3-NEXT: movzbl 384(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8
; SSE3-NEXT: popq %rbx
@@ -790,114 +788,112 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, (%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: andl $31, %r8d
-; SSSE3-NEXT: movzbl -96(%rsp,%r8), %esi
-; SSSE3-NEXT: movd %esi, %xmm8
-; SSSE3-NEXT: andl $31, %ebp
-; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %esi
-; SSSE3-NEXT: movd %esi, %xmm15
-; SSSE3-NEXT: andl $31, %edx
-; SSSE3-NEXT: movzbl -32(%rsp,%rdx), %edx
-; SSSE3-NEXT: movd %edx, %xmm9
-; SSSE3-NEXT: andl $31, %ecx
-; SSSE3-NEXT: movzbl (%rsp,%rcx), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 32(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm10
-; SSSE3-NEXT: andl $31, %edi
-; SSSE3-NEXT: movzbl 64(%rsp,%rdi), %eax
+; SSSE3-NEXT: movzbl -96(%rsp,%rax), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: andl $31, %ebp
+; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: andl $31, %r13d
+; SSSE3-NEXT: movzbl -32(%rsp,%r13), %eax
+; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: andl $31, %r12d
+; SSSE3-NEXT: movzbl (%rsp,%r12), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: andl $31, %r15d
+; SSSE3-NEXT: movzbl 32(%rsp,%r15), %eax
+; SSSE3-NEXT: movd %eax, %xmm5
+; SSSE3-NEXT: andl $31, %r14d
+; SSSE3-NEXT: movzbl 64(%rsp,%r14), %eax
; SSSE3-NEXT: movd %eax, %xmm7
; SSSE3-NEXT: andl $31, %ebx
; SSSE3-NEXT: movzbl 96(%rsp,%rbx), %eax
-; SSSE3-NEXT: movd %eax, %xmm11
-; SSSE3-NEXT: andl $31, %r9d
-; SSSE3-NEXT: movzbl 128(%rsp,%r9), %eax
+; SSSE3-NEXT: movd %eax, %xmm8
+; SSSE3-NEXT: andl $31, %r11d
+; SSSE3-NEXT: movzbl 128(%rsp,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm6
-; SSSE3-NEXT: andl $31, %r13d
-; SSSE3-NEXT: movzbl 160(%rsp,%r13), %eax
+; SSSE3-NEXT: andl $31, %r10d
+; SSSE3-NEXT: movzbl 160(%rsp,%r10), %eax
+; SSSE3-NEXT: movd %eax, %xmm9
+; SSSE3-NEXT: andl $31, %r9d
+; SSSE3-NEXT: movzbl 192(%rsp,%r9), %eax
+; SSSE3-NEXT: movd %eax, %xmm10
+; SSSE3-NEXT: andl $31, %r8d
+; SSSE3-NEXT: movzbl 224(%rsp,%r8), %eax
+; SSSE3-NEXT: movd %eax, %xmm11
+; SSSE3-NEXT: andl $31, %edi
+; SSSE3-NEXT: movzbl 256(%rsp,%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm12
-; SSSE3-NEXT: andl $31, %r12d
-; SSSE3-NEXT: movzbl 192(%rsp,%r12), %eax
-; SSSE3-NEXT: movd %eax, %xmm5
-; SSSE3-NEXT: andl $31, %r15d
-; SSSE3-NEXT: movzbl 224(%rsp,%r15), %eax
+; SSSE3-NEXT: andl $31, %esi
+; SSSE3-NEXT: movzbl 288(%rsp,%rsi), %eax
; SSSE3-NEXT: movd %eax, %xmm13
-; SSSE3-NEXT: andl $31, %r14d
-; SSSE3-NEXT: movzbl 256(%rsp,%r14), %eax
-; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: andl $31, %r11d
-; SSSE3-NEXT: movzbl 288(%rsp,%r11), %eax
+; SSSE3-NEXT: andl $31, %edx
+; SSSE3-NEXT: movzbl 320(%rsp,%rdx), %eax
; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: andl $31, %r10d
-; SSSE3-NEXT: movzbl 320(%rsp,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 352(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: andl $31, %ecx
+; SSSE3-NEXT: movzbl 352(%rsp,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm15
; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
; SSSE3-NEXT: movzbl 384(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8
; SSSE3-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll
index fb9da199bae5a..c512448ab7db2 100644
--- a/llvm/test/CodeGen/X86/var-permute-512.ll
+++ b/llvm/test/CodeGen/X86/var-permute-512.ll
@@ -1122,7 +1122,7 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b)
; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm8
+; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm5, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
@@ -1213,98 +1213,98 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b)
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7
; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm8
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7
-; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vmovd %xmm8, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $1, %xmm0, %eax
+; AVX512F-NEXT: vpextrd $1, %xmm8, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512F-NEXT: vpextrd $2, %xmm8, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $3, %xmm0, %eax
-; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm0
+; AVX512F-NEXT: vpextrd $3, %xmm8, %eax
+; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm8
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7
-; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vmovd %xmm8, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $1, %xmm0, %eax
+; AVX512F-NEXT: vpextrd $1, %xmm8, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512F-NEXT: vpextrd $2, %xmm8, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $3, %xmm0, %eax
-; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm0
+; AVX512F-NEXT: vpextrd $3, %xmm8, %eax
+; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm2
-; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7
+; AVX512F-NEXT: vmovd %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: vpextrd $1, %xmm0, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $1, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $2, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: vpextrd $3, %xmm0, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2
; AVX512F-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5
+; AVX512F-NEXT: vmovd %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vpextrd $1, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vpextrd $2, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vpextrd $3, %xmm2, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $3, %xmm3, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3
; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
; AVX512F-NEXT: vpextrd $3, %xmm1, %eax
; AVX512F-NEXT: andl $63, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1
-; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2
+; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
+; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1
-; AVX512F-NEXT: vpmovsxbd %xmm8, %zmm3
-; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3
-; AVX512F-NEXT: vmovaps %zmm3, 192(%rdi)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0
+; AVX512F-NEXT: vmovaps %zmm0, 192(%rdi)
; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi)
-; AVX512F-NEXT: vmovaps %zmm2, 64(%rdi)
-; AVX512F-NEXT: vmovaps %zmm0, (%rdi)
+; AVX512F-NEXT: vmovaps %zmm3, 64(%rdi)
+; AVX512F-NEXT: vmovaps %zmm2, (%rdi)
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: vzeroupper
@@ -1373,7 +1373,7 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b)
; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax
; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm8
+; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm5, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
@@ -1464,98 +1464,98 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b)
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7
; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm8
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7
-; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vmovd %xmm8, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $1, %xmm0, %eax
+; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $3, %xmm0, %eax
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm0
+; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm8
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7
-; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vmovd %xmm8, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $1, %xmm0, %eax
+; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $3, %xmm0, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm0
+; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm2
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm7, %xmm2
-; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7
+; AVX512BW-NEXT: vmovd %xmm2, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrd $1, %xmm0, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7
+; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7
+; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrd $3, %xmm0, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7
+; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2
; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm3
-; AVX512BW-NEXT: vmovd %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5
+; AVX512BW-NEXT: vmovd %xmm3, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
+; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
+; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
+; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3
; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax
; AVX512BW-NEXT: andl $63, %eax
; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1
-; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512BW-NEXT: vcvtdq2ps %zmm0, %zmm0
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1
; AVX512BW-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512BW-NEXT: vcvtdq2ps %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3
+; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3
; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovsxbd %xmm8, %zmm3
-; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3
-; AVX512BW-NEXT: vmovaps %zmm3, 192(%rdi)
+; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512BW-NEXT: vcvtdq2ps %zmm0, %zmm0
+; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdi)
; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdi)
-; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdi)
-; AVX512BW-NEXT: vmovaps %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovaps %zmm3, 64(%rdi)
+; AVX512BW-NEXT: vmovaps %zmm2, (%rdi)
; AVX512BW-NEXT: movq %rbp, %rsp
; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 5bbd634068c08..7f1ee178009b7 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4786,54 +4786,54 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd (%rdi), %ymm2
; AVX1-NEXT: vmovapd 32(%rdi), %ymm3
-; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [1,1,1,1]
-; AVX1-NEXT: vandpd %ymm3, %ymm8, %ymm5
-; AVX1-NEXT: vmovdqa (%rdi), %xmm9
+; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; AVX1-NEXT: vandpd %ymm4, %ymm3, %ymm5
+; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm6, %xmm7
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
-; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vorpd %ymm5, %ymm0, %ymm0
-; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm3, %ymm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm3
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8
+; AVX1-NEXT: vpsrlq $1, %xmm8, %xmm9
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
+; AVX1-NEXT: vorpd %ymm5, %ymm7, %ymm5
+; AVX1-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
+; AVX1-NEXT: vpextrq $1, %xmm3, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3]
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
-; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm3
-; AVX1-NEXT: vpackssdw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vblendvps %xmm4, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vandpd %ymm2, %ymm8, %ymm3
-; AVX1-NEXT: vpsrlq $1, %xmm9, %xmm4
-; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT: vorpd %ymm3, %ymm4, %ymm3
-; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX1-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm7
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm7
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
+; AVX1-NEXT: vpextrq $1, %xmm3, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm3
-; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[0]
+; AVX1-NEXT: vaddps %xmm3, %xmm3, %xmm5
+; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm6
+; AVX1-NEXT: vblendvps %xmm6, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandpd %ymm4, %ymm2, %ymm4
+; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm5
+; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: vorpd %ymm4, %ymm5, %ymm4
+; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vmovq %xmm2, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0],xmm4[3]
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm2
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0]
-; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm3
-; AVX1-NEXT: vpackssdw %xmm1, %xmm9, %xmm1
-; AVX1-NEXT: vblendvps %xmm1, %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[0]
+; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm4
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm4, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i64_to_8f32:
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index bbdd6bf0b5ac2..88f36137fed22 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -455,8 +455,8 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7
+; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
@@ -465,26 +465,26 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8
+; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -1038,110 +1038,110 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: saddo_v2i128:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: addq %r8, %rdi
; SSE2-NEXT: adcq %r9, %rsi
; SSE2-NEXT: seto %r8b
; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: seto %al
-; SSE2-NEXT: movzbl %al, %eax
-; SSE2-NEXT: negl %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movzbl %r8b, %eax
-; SSE2-NEXT: negl %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: seto %r9b
+; SSE2-NEXT: movzbl %r9b, %r9d
+; SSE2-NEXT: negl %r9d
+; SSE2-NEXT: movd %r9d, %xmm1
+; SSE2-NEXT: movzbl %r8b, %r8d
+; SSE2-NEXT: negl %r8d
+; SSE2-NEXT: movd %r8d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rdx, 16(%r10)
-; SSE2-NEXT: movq %rdi, (%r10)
-; SSE2-NEXT: movq %rcx, 24(%r10)
-; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: movq %rdx, 16(%rax)
+; SSE2-NEXT: movq %rdi, (%rax)
+; SSE2-NEXT: movq %rcx, 24(%rax)
+; SSE2-NEXT: movq %rsi, 8(%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: saddo_v2i128:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSSE3-NEXT: addq %r8, %rdi
; SSSE3-NEXT: adcq %r9, %rsi
; SSSE3-NEXT: seto %r8b
; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: seto %al
-; SSSE3-NEXT: movzbl %al, %eax
-; SSSE3-NEXT: negl %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movzbl %r8b, %eax
-; SSSE3-NEXT: negl %eax
-; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: seto %r9b
+; SSSE3-NEXT: movzbl %r9b, %r9d
+; SSSE3-NEXT: negl %r9d
+; SSSE3-NEXT: movd %r9d, %xmm1
+; SSSE3-NEXT: movzbl %r8b, %r8d
+; SSSE3-NEXT: negl %r8d
+; SSSE3-NEXT: movd %r8d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rdx, 16(%r10)
-; SSSE3-NEXT: movq %rdi, (%r10)
-; SSSE3-NEXT: movq %rcx, 24(%r10)
-; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: movq %rdx, 16(%rax)
+; SSSE3-NEXT: movq %rdi, (%rax)
+; SSSE3-NEXT: movq %rcx, 24(%rax)
+; SSSE3-NEXT: movq %rsi, 8(%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: saddo_v2i128:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE41-NEXT: addq %r8, %rdi
; SSE41-NEXT: adcq %r9, %rsi
; SSE41-NEXT: seto %r8b
; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: seto %al
-; SSE41-NEXT: movzbl %al, %r9d
+; SSE41-NEXT: seto %r9b
+; SSE41-NEXT: movzbl %r9b, %r9d
; SSE41-NEXT: negl %r9d
-; SSE41-NEXT: movzbl %r8b, %eax
-; SSE41-NEXT: negl %eax
-; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movzbl %r8b, %r8d
+; SSE41-NEXT: negl %r8d
+; SSE41-NEXT: movd %r8d, %xmm0
; SSE41-NEXT: pinsrd $1, %r9d, %xmm0
-; SSE41-NEXT: movq %rdx, 16(%r10)
-; SSE41-NEXT: movq %rdi, (%r10)
-; SSE41-NEXT: movq %rcx, 24(%r10)
-; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: movq %rdx, 16(%rax)
+; SSE41-NEXT: movq %rdi, (%rax)
+; SSE41-NEXT: movq %rcx, 24(%rax)
+; SSE41-NEXT: movq %rsi, 8(%rax)
; SSE41-NEXT: retq
;
; AVX-LABEL: saddo_v2i128:
; AVX: # %bb.0:
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: addq %r8, %rdi
; AVX-NEXT: adcq %r9, %rsi
; AVX-NEXT: seto %r8b
; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: seto %al
-; AVX-NEXT: movzbl %al, %r9d
+; AVX-NEXT: seto %r9b
+; AVX-NEXT: movzbl %r9b, %r9d
; AVX-NEXT: negl %r9d
-; AVX-NEXT: movzbl %r8b, %eax
-; AVX-NEXT: negl %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: movzbl %r8b, %r8d
+; AVX-NEXT: negl %r8d
+; AVX-NEXT: vmovd %r8d, %xmm0
; AVX-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0
-; AVX-NEXT: movq %rdx, 16(%r10)
-; AVX-NEXT: movq %rdi, (%r10)
-; AVX-NEXT: movq %rcx, 24(%r10)
-; AVX-NEXT: movq %rsi, 8(%r10)
+; AVX-NEXT: movq %rdx, 16(%rax)
+; AVX-NEXT: movq %rdi, (%rax)
+; AVX-NEXT: movq %rcx, 24(%rax)
+; AVX-NEXT: movq %rsi, 8(%rax)
; AVX-NEXT: retq
;
; AVX512-LABEL: saddo_v2i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: seto %al
-; AVX512-NEXT: kmovd %eax, %k0
+; AVX512-NEXT: seto %r10b
+; AVX512-NEXT: kmovd %r10d, %k0
; AVX512-NEXT: addq %r8, %rdi
; AVX512-NEXT: adcq %r9, %rsi
-; AVX512-NEXT: seto %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: seto %r8b
+; AVX512-NEXT: andl $1, %r8d
+; AVX512-NEXT: kmovw %r8d, %k1
; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: korw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: movq %rdx, 16(%r10)
-; AVX512-NEXT: movq %rdi, (%r10)
-; AVX512-NEXT: movq %rcx, 24(%r10)
-; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: movq %rdx, 16(%rax)
+; AVX512-NEXT: movq %rdi, (%rax)
+; AVX512-NEXT: movq %rcx, 24(%rax)
+; AVX512-NEXT: movq %rsi, 8(%rax)
; AVX512-NEXT: retq
%t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 2882d576cd061..dbec86755a969 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -436,58 +436,58 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
-; SSE2-NEXT: movd %r9d, %xmm10
+; SSE2-NEXT: movd %r9d, %xmm0
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movdqa %xmm10, %xmm9
-; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; SSE2-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE2-NEXT: pmuludq %xmm6, %xmm10
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSE2-NEXT: pmuludq %xmm6, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT: pand %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: paddd %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: psubd %xmm3, %xmm0
+; SSE2-NEXT: pmuludq %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; SSE2-NEXT: psubd %xmm9, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm9, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pand %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: pmuludq %xmm8, %xmm1
+; SSE2-NEXT: paddd %xmm8, %xmm7
+; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: psubd %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: movq %xmm3, 16(%rcx)
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: movq %xmm3, 16(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movq %xmm0, 16(%rcx)
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm0
+; SSE2-NEXT: movq %xmm0, 16(%rdi)
; SSE2-NEXT: movdqa %xmm2, (%rdi)
; SSE2-NEXT: retq
;
@@ -508,58 +508,58 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
-; SSSE3-NEXT: movd %r9d, %xmm10
+; SSSE3-NEXT: movd %r9d, %xmm0
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movdqa %xmm10, %xmm9
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSSE3-NEXT: pmuludq %xmm6, %xmm10
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSSE3-NEXT: pmuludq %xmm6, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: paddd %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
+; SSSE3-NEXT: pand %xmm2, %xmm8
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
+; SSSE3-NEXT: pand %xmm5, %xmm9
+; SSSE3-NEXT: paddd %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: psubd %xmm3, %xmm0
+; SSSE3-NEXT: pmuludq %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; SSSE3-NEXT: psubd %xmm9, %xmm10
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSSE3-NEXT: movdqa %xmm2, (%rcx)
; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
-; SSSE3-NEXT: pand %xmm9, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT: pxor %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8
+; SSSE3-NEXT: pand %xmm3, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: paddd %xmm3, %xmm7
-; SSSE3-NEXT: pmuludq %xmm8, %xmm1
+; SSSE3-NEXT: paddd %xmm8, %xmm7
+; SSSE3-NEXT: pmuludq %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSSE3-NEXT: psubd %xmm7, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT: movq %xmm3, 16(%rcx)
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm0, %xmm3
-; SSSE3-NEXT: movq %xmm3, 16(%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movq %xmm0, 16(%rcx)
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSSE3-NEXT: movq %xmm0, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm2, (%rdi)
; SSSE3-NEXT: retq
;
@@ -892,84 +892,84 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm11, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
; SSE2-NEXT: psubd %xmm10, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT: pxor %xmm9, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE2-NEXT: pand %xmm1, %xmm9
; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
-; SSE2-NEXT: pand %xmm1, %xmm10
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: paddd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT: pand %xmm5, %xmm10
+; SSE2-NEXT: paddd %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm10, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
-; SSE2-NEXT: psubd %xmm4, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; SSE2-NEXT: psubd %xmm10, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: paddd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE2-NEXT: pand %xmm6, %xmm9
+; SSE2-NEXT: paddd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSE2-NEXT: psubd %xmm9, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm8
-; SSE2-NEXT: paddd %xmm4, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: paddd %xmm5, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE2-NEXT: psubd %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT: psubd %xmm8, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pxor %xmm9, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: smulo_v16i32:
@@ -985,84 +985,84 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm4, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm11, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm11, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
; SSSE3-NEXT: psubd %xmm10, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT: pxor %xmm9, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm9
+; SSSE3-NEXT: pand %xmm1, %xmm9
; SSSE3-NEXT: pxor %xmm10, %xmm10
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
-; SSSE3-NEXT: pand %xmm1, %xmm10
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: paddd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10
+; SSSE3-NEXT: pand %xmm5, %xmm10
+; SSSE3-NEXT: paddd %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm10, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
-; SSSE3-NEXT: psubd %xmm4, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm9, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; SSSE3-NEXT: psubd %xmm10, %xmm11
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
-; SSSE3-NEXT: pand %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm1
; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pand %xmm6, %xmm5
-; SSSE3-NEXT: paddd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT: pand %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
+; SSSE3-NEXT: pand %xmm6, %xmm9
+; SSSE3-NEXT: paddd %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm10, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSSE3-NEXT: psubd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSSE3-NEXT: psubd %xmm9, %xmm10
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
-; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm3, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT: pand %xmm3, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8
; SSSE3-NEXT: pand %xmm7, %xmm8
-; SSSE3-NEXT: paddd %xmm4, %xmm8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT: paddd %xmm5, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm7, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSSE3-NEXT: psubd %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm5, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSSE3-NEXT: psubd %xmm8, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pxor %xmm9, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: smulo_v16i32:
@@ -1078,44 +1078,44 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE41-NEXT: pxor %xmm8, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm9, %xmm10
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pmuldq %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3],xmm4[4,5],xmm10[6,7]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm8, %xmm9
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: pmuldq %xmm5, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7]
; SSE41-NEXT: pmulld %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, 16(%rdi)
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE41-NEXT: pxor %xmm8, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pmuldq %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm5, %xmm8
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: pmuldq %xmm6, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5],xmm8[6,7]
; SSE41-NEXT: pmulld %xmm6, %xmm2
; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE41-NEXT: pxor %xmm8, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pmuldq %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm5, %xmm6
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: pmuldq %xmm7, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
; SSE41-NEXT: pmulld %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm8, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: smulo_v16i32:
@@ -1128,45 +1128,45 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpmuldq %xmm4, %xmm6, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
-; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm8
-; AVX1-NEXT: vpsrad $31, %xmm8, %xmm6
+; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm4, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5],xmm4[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpmuldq %xmm4, %xmm7, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
-; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpsrad $31, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm7, %xmm9, %xmm7
+; AVX1-NEXT: vpmuldq %xmm6, %xmm8, %xmm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3],xmm9[4,5],xmm7[6,7]
+; AVX1-NEXT: vpmulld %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpsrad $31, %xmm6, %xmm8
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5],xmm8[6,7]
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -1177,9 +1177,9 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
-; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
-; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
@@ -1477,44 +1477,44 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
-; SSE2-NEXT: pmulhw %xmm7, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm7
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: pmulhw %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: psrlw $8, %xmm7
; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm9, %xmm6
-; SSE2-NEXT: pand %xmm9, %xmm10
-; SSE2-NEXT: packuswb %xmm6, %xmm10
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm10, %xmm3
-; SSE2-NEXT: pcmpeqb %xmm7, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm7, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT: pxor %xmm6, %xmm1
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: pmulhw %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSE2-NEXT: pmulhw %xmm7, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm7
; SSE2-NEXT: psrlw $8, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: pmulhw %xmm5, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm0
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pmulhw %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm9, %xmm6
-; SSE2-NEXT: pand %xmm9, %xmm11
-; SSE2-NEXT: packuswb %xmm6, %xmm11
-; SSE2-NEXT: pcmpgtb %xmm11, %xmm4
+; SSE2-NEXT: pand %xmm5, %xmm8
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: packuswb %xmm8, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pcmpeqb %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm4
@@ -1527,30 +1527,30 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm7
; SSE2-NEXT: psrad $31, %xmm7
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $31, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; SSE2-NEXT: psrad $24, %xmm2
-; SSE2-NEXT: movdqa %xmm10, 16(%rsi)
-; SSE2-NEXT: movdqa %xmm11, (%rsi)
-; SSE2-NEXT: movdqa %xmm2, 64(%rdi)
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm8
+; SSE2-NEXT: movdqa %xmm3, 16(%rsi)
+; SSE2-NEXT: movdqa %xmm2, (%rsi)
+; SSE2-NEXT: movdqa %xmm8, 64(%rdi)
; SSE2-NEXT: movdqa %xmm5, (%rdi)
-; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
-; SSE2-NEXT: movdqa %xmm1, 96(%rdi)
+; SSE2-NEXT: movdqa %xmm1, 112(%rdi)
+; SSE2-NEXT: movdqa %xmm9, 96(%rdi)
; SSE2-NEXT: movdqa %xmm7, 80(%rdi)
; SSE2-NEXT: movdqa %xmm0, 48(%rdi)
; SSE2-NEXT: movdqa %xmm6, 32(%rdi)
@@ -1570,44 +1570,44 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: psrlw $8, %xmm5
; SSSE3-NEXT: pxor %xmm7, %xmm7
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; SSSE3-NEXT: pxor %xmm10, %xmm10
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
-; SSSE3-NEXT: pmulhw %xmm7, %xmm10
-; SSSE3-NEXT: movdqa %xmm10, %xmm7
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSSE3-NEXT: pmulhw %xmm7, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm7
; SSSE3-NEXT: psrlw $8, %xmm7
; SSSE3-NEXT: packuswb %xmm5, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
-; SSSE3-NEXT: pand %xmm9, %xmm6
-; SSSE3-NEXT: pand %xmm9, %xmm10
-; SSSE3-NEXT: packuswb %xmm6, %xmm10
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtb %xmm10, %xmm3
-; SSSE3-NEXT: pcmpeqb %xmm7, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
-; SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSSE3-NEXT: pand %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: packuswb %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm7, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
+; SSSE3-NEXT: pxor %xmm6, %xmm1
; SSSE3-NEXT: pxor %xmm7, %xmm7
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSSE3-NEXT: pmulhw %xmm7, %xmm6
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSSE3-NEXT: pmulhw %xmm7, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm7
; SSSE3-NEXT: psrlw $8, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSSE3-NEXT: pxor %xmm11, %xmm11
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSSE3-NEXT: pmulhw %xmm5, %xmm11
-; SSSE3-NEXT: movdqa %xmm11, %xmm0
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: pmulhw %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: packuswb %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm9, %xmm6
-; SSSE3-NEXT: pand %xmm9, %xmm11
-; SSSE3-NEXT: packuswb %xmm6, %xmm11
-; SSSE3-NEXT: pcmpgtb %xmm11, %xmm4
+; SSSE3-NEXT: pand %xmm5, %xmm8
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: packuswb %xmm8, %xmm2
+; SSSE3-NEXT: pcmpgtb %xmm2, %xmm4
; SSSE3-NEXT: pcmpeqb %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: pxor %xmm6, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm4
@@ -1620,30 +1620,30 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm7
+; SSSE3-NEXT: movdqa %xmm1, %xmm7
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm7
; SSSE3-NEXT: psrad $31, %xmm7
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm9
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pslld $31, %xmm9
+; SSSE3-NEXT: psrad $31, %xmm9
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pslld $31, %xmm3
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm5
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: movdqa %xmm10, 16(%rsi)
-; SSSE3-NEXT: movdqa %xmm11, (%rsi)
-; SSSE3-NEXT: movdqa %xmm2, 64(%rdi)
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm8
+; SSSE3-NEXT: movdqa %xmm3, 16(%rsi)
+; SSSE3-NEXT: movdqa %xmm2, (%rsi)
+; SSSE3-NEXT: movdqa %xmm8, 64(%rdi)
; SSSE3-NEXT: movdqa %xmm5, (%rdi)
-; SSSE3-NEXT: movdqa %xmm3, 112(%rdi)
-; SSSE3-NEXT: movdqa %xmm1, 96(%rdi)
+; SSSE3-NEXT: movdqa %xmm1, 112(%rdi)
+; SSSE3-NEXT: movdqa %xmm9, 96(%rdi)
; SSSE3-NEXT: movdqa %xmm7, 80(%rdi)
; SSSE3-NEXT: movdqa %xmm0, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm6, 32(%rdi)
@@ -1653,7 +1653,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSE41-LABEL: smulo_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
-; SSE41-NEXT: pxor %xmm10, %xmm10
+; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
; SSE41-NEXT: pxor %xmm6, %xmm6
@@ -1669,45 +1669,45 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $8, %xmm7
; SSE41-NEXT: packuswb %xmm5, %xmm7
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm9, %xmm6
-; SSE41-NEXT: pand %xmm9, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: packuswb %xmm6, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpgtb %xmm3, %xmm1
; SSE41-NEXT: pcmpeqb %xmm7, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE41-NEXT: pxor %xmm8, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT: pxor %xmm6, %xmm1
; SSE41-NEXT: pxor %xmm7, %xmm7
; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
-; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE41-NEXT: pmulhw %xmm7, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSE41-NEXT: pmulhw %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm8, %xmm7
; SSE41-NEXT: psrlw $8, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; SSE41-NEXT: pxor %xmm9, %xmm9
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE41-NEXT: pmulhw %xmm5, %xmm2
+; SSE41-NEXT: pmulhw %xmm9, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: packuswb %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm6
-; SSE41-NEXT: pand %xmm9, %xmm2
-; SSE41-NEXT: packuswb %xmm6, %xmm2
-; SSE41-NEXT: pcmpgtb %xmm2, %xmm10
-; SSE41-NEXT: pcmpeqb %xmm0, %xmm10
-; SSE41-NEXT: pxor %xmm8, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pslld $31, %xmm8
-; SSE41-NEXT: psrad $31, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE41-NEXT: pand %xmm5, %xmm8
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: packuswb %xmm8, %xmm2
+; SSE41-NEXT: pcmpgtb %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm6, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm5
; SSE41-NEXT: psrad $31, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,3,3,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm6
; SSE41-NEXT: psrad $31, %xmm6
@@ -1715,26 +1715,26 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm7
; SSE41-NEXT: psrad $31, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pslld $31, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; SSE41-NEXT: pslld $31, %xmm4
-; SSE41-NEXT: psrad $31, %xmm4
-; SSE41-NEXT: pmovsxbd %xmm10, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm8
+; SSE41-NEXT: psrad $31, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm9
+; SSE41-NEXT: psrad $31, %xmm9
+; SSE41-NEXT: pmovsxbd %xmm4, %xmm4
; SSE41-NEXT: pmovsxbd %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm3, 16(%rsi)
; SSE41-NEXT: movdqa %xmm2, (%rsi)
; SSE41-NEXT: movdqa %xmm1, 64(%rdi)
-; SSE41-NEXT: movdqa %xmm9, (%rdi)
-; SSE41-NEXT: movdqa %xmm4, 112(%rdi)
-; SSE41-NEXT: movdqa %xmm0, 96(%rdi)
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: movdqa %xmm9, 112(%rdi)
+; SSE41-NEXT: movdqa %xmm8, 96(%rdi)
; SSE41-NEXT: movdqa %xmm7, 80(%rdi)
; SSE41-NEXT: movdqa %xmm6, 48(%rdi)
; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
-; SSE41-NEXT: movdqa %xmm8, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: smulo_v32i8:
@@ -1757,43 +1757,43 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX1-NEXT: vpmulhw %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm7
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT: vpmulhw %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm8
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpcmpgtb %xmm5, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm7, %xmm0, %xmm6
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm8, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
-; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1]
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm5, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm6, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: smulo_v32i8:
@@ -1891,94 +1891,94 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSE2-NEXT: psrlw $8, %xmm8
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
-; SSE2-NEXT: pmulhw %xmm10, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: psrlw $8, %xmm7
-; SSE2-NEXT: packuswb %xmm8, %xmm7
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; SSE2-NEXT: pmulhw %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: psrlw $8, %xmm10
+; SSE2-NEXT: packuswb %xmm8, %xmm10
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm8, %xmm9
-; SSE2-NEXT: pand %xmm8, %xmm11
-; SSE2-NEXT: packuswb %xmm9, %xmm11
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpgtb %xmm11, %xmm9
-; SSE2-NEXT: pcmpeqb %xmm7, %xmm9
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: packuswb %xmm9, %xmm7
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: pmulhw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE2-NEXT: pcmpeqb %xmm10, %xmm3
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; SSE2-NEXT: pmulhw %xmm9, %xmm10
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm10, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmulhw %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: psrlw $8, %xmm7
-; SSE2-NEXT: packuswb %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pmulhw %xmm9, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm9
+; SSE2-NEXT: psrlw $8, %xmm9
+; SSE2-NEXT: packuswb %xmm2, %xmm9
+; SSE2-NEXT: pand %xmm8, %xmm10
; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: packuswb %xmm3, %xmm6
+; SSE2-NEXT: packuswb %xmm10, %xmm6
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm6, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
-; SSE2-NEXT: pmulhw %xmm3, %xmm7
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: pcmpeqb %xmm9, %xmm2
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15]
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
+; SSE2-NEXT: pmulhw %xmm9, %xmm10
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmulhw %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: packuswb %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pmulhw %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: psrlw $8, %xmm9
+; SSE2-NEXT: packuswb %xmm1, %xmm9
+; SSE2-NEXT: pand %xmm8, %xmm10
; SSE2-NEXT: pand %xmm8, %xmm5
-; SSE2-NEXT: packuswb %xmm7, %xmm5
+; SSE2-NEXT: packuswb %xmm10, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm5, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT: pmulhw %xmm3, %xmm7
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: pcmpeqb %xmm9, %xmm1
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; SSE2-NEXT: pmulhw %xmm9, %xmm10
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm10, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pmulhw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: packuswb %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pmulhw %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: psrlw $8, %xmm9
+; SSE2-NEXT: packuswb %xmm0, %xmm9
+; SSE2-NEXT: pand %xmm8, %xmm10
; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: packuswb %xmm7, %xmm4
+; SSE2-NEXT: packuswb %xmm10, %xmm4
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtb %xmm4, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm3, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm11, 48(%rsi)
+; SSE2-NEXT: pcmpeqb %xmm9, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm7, 48(%rsi)
; SSE2-NEXT: movdqa %xmm6, 32(%rsi)
; SSE2-NEXT: movdqa %xmm5, 16(%rsi)
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm4, (%rsi)
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm4
; SSE2-NEXT: movdqa %xmm4, 192(%rdi)
@@ -1994,31 +1994,31 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm4
; SSE2-NEXT: movdqa %xmm4, (%rdi)
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pslld $31, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: movdqa %xmm3, 224(%rdi)
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: movdqa %xmm5, 224(%rdi)
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: movdqa %xmm4, 240(%rdi)
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $31, %xmm9
-; SSE2-NEXT: psrad $31, %xmm9
-; SSE2-NEXT: movdqa %xmm9, 208(%rdi)
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: movdqa %xmm3, 160(%rdi)
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm3, 208(%rdi)
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: movdqa %xmm4, 176(%rdi)
+; SSE2-NEXT: movdqa %xmm4, 160(%rdi)
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: movdqa %xmm3, 176(%rdi)
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -2070,94 +2070,94 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: psrlw $8, %xmm8
; SSSE3-NEXT: pxor %xmm10, %xmm10
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
-; SSSE3-NEXT: pxor %xmm11, %xmm11
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
-; SSSE3-NEXT: pmulhw %xmm10, %xmm11
-; SSSE3-NEXT: movdqa %xmm11, %xmm7
-; SSSE3-NEXT: psrlw $8, %xmm7
-; SSSE3-NEXT: packuswb %xmm8, %xmm7
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; SSSE3-NEXT: pmulhw %xmm10, %xmm7
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: psrlw $8, %xmm10
+; SSSE3-NEXT: packuswb %xmm8, %xmm10
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT: pand %xmm8, %xmm9
-; SSSE3-NEXT: pand %xmm8, %xmm11
-; SSSE3-NEXT: packuswb %xmm9, %xmm11
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpgtb %xmm11, %xmm9
-; SSSE3-NEXT: pcmpeqb %xmm7, %xmm9
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: packuswb %xmm9, %xmm7
; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSSE3-NEXT: pmulhw %xmm7, %xmm3
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSSE3-NEXT: pcmpgtb %xmm7, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; SSSE3-NEXT: pxor %xmm10, %xmm10
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; SSSE3-NEXT: pmulhw %xmm9, %xmm10
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
; SSSE3-NEXT: pxor %xmm6, %xmm6
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm10, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm2
-; SSSE3-NEXT: pmulhw %xmm7, %xmm6
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: psrlw $8, %xmm7
-; SSSE3-NEXT: packuswb %xmm2, %xmm7
-; SSSE3-NEXT: pand %xmm8, %xmm3
+; SSSE3-NEXT: pmulhw %xmm9, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: psrlw $8, %xmm9
+; SSSE3-NEXT: packuswb %xmm2, %xmm9
+; SSSE3-NEXT: pand %xmm8, %xmm10
; SSSE3-NEXT: pand %xmm8, %xmm6
-; SSSE3-NEXT: packuswb %xmm3, %xmm6
+; SSSE3-NEXT: packuswb %xmm10, %xmm6
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pcmpgtb %xmm6, %xmm2
-; SSSE3-NEXT: pcmpeqb %xmm7, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
-; SSSE3-NEXT: pmulhw %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15]
+; SSSE3-NEXT: pxor %xmm10, %xmm10
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
+; SSSE3-NEXT: pmulhw %xmm9, %xmm10
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSSE3-NEXT: movdqa %xmm7, %xmm1
+; SSSE3-NEXT: movdqa %xmm10, %xmm1
; SSSE3-NEXT: psrlw $8, %xmm1
-; SSSE3-NEXT: pmulhw %xmm3, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm3
-; SSSE3-NEXT: psrlw $8, %xmm3
-; SSSE3-NEXT: packuswb %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pmulhw %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: psrlw $8, %xmm9
+; SSSE3-NEXT: packuswb %xmm1, %xmm9
+; SSSE3-NEXT: pand %xmm8, %xmm10
; SSSE3-NEXT: pand %xmm8, %xmm5
-; SSSE3-NEXT: packuswb %xmm7, %xmm5
+; SSSE3-NEXT: packuswb %xmm10, %xmm5
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pcmpgtb %xmm5, %xmm1
-; SSSE3-NEXT: pcmpeqb %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSSE3-NEXT: pmulhw %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
+; SSSE3-NEXT: pxor %xmm10, %xmm10
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; SSSE3-NEXT: pmulhw %xmm9, %xmm10
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSSE3-NEXT: movdqa %xmm7, %xmm0
+; SSSE3-NEXT: movdqa %xmm10, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
-; SSSE3-NEXT: pmulhw %xmm3, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSSE3-NEXT: psrlw $8, %xmm3
-; SSSE3-NEXT: packuswb %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pmulhw %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: psrlw $8, %xmm9
+; SSSE3-NEXT: packuswb %xmm0, %xmm9
+; SSSE3-NEXT: pand %xmm8, %xmm10
; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: packuswb %xmm7, %xmm4
+; SSSE3-NEXT: packuswb %xmm10, %xmm4
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pcmpgtb %xmm4, %xmm0
-; SSSE3-NEXT: pcmpeqb %xmm3, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm3, %xmm9
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm11, 48(%rsi)
+; SSSE3-NEXT: pcmpeqb %xmm9, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
+; SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSSE3-NEXT: pxor %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSSE3-NEXT: movdqa %xmm7, 48(%rsi)
; SSSE3-NEXT: movdqa %xmm6, 32(%rsi)
; SSSE3-NEXT: movdqa %xmm5, 16(%rsi)
-; SSSE3-NEXT: movdqa %xmm9, %xmm3
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: movdqa %xmm4, (%rsi)
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm4
; SSSE3-NEXT: movdqa %xmm4, 192(%rdi)
@@ -2173,31 +2173,31 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm4
; SSSE3-NEXT: movdqa %xmm4, (%rdi)
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: pslld $31, %xmm3
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, 224(%rdi)
+; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pslld $31, %xmm5
+; SSSE3-NEXT: psrad $31, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, 224(%rdi)
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: movdqa %xmm4, 240(%rdi)
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pslld $31, %xmm9
-; SSSE3-NEXT: psrad $31, %xmm9
-; SSSE3-NEXT: movdqa %xmm9, 208(%rdi)
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm3
; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, 160(%rdi)
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: movdqa %xmm3, 208(%rdi)
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, 176(%rdi)
+; SSSE3-NEXT: movdqa %xmm4, 160(%rdi)
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pslld $31, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, 176(%rdi)
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -2249,110 +2249,110 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSE41-NEXT: psrlw $8, %xmm8
; SSE41-NEXT: pxor %xmm10, %xmm10
; SSE41-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
-; SSE41-NEXT: pxor %xmm11, %xmm11
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
-; SSE41-NEXT: pmulhw %xmm10, %xmm11
-; SSE41-NEXT: movdqa %xmm11, %xmm7
-; SSE41-NEXT: psrlw $8, %xmm7
-; SSE41-NEXT: packuswb %xmm8, %xmm7
+; SSE41-NEXT: pxor %xmm7, %xmm7
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; SSE41-NEXT: pmulhw %xmm10, %xmm7
+; SSE41-NEXT: movdqa %xmm7, %xmm10
+; SSE41-NEXT: psrlw $8, %xmm10
+; SSE41-NEXT: packuswb %xmm8, %xmm10
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pand %xmm8, %xmm11
-; SSE41-NEXT: packuswb %xmm9, %xmm11
-; SSE41-NEXT: pxor %xmm9, %xmm9
-; SSE41-NEXT: pcmpgtb %xmm11, %xmm9
-; SSE41-NEXT: pcmpeqb %xmm7, %xmm9
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; SSE41-NEXT: pand %xmm8, %xmm7
+; SSE41-NEXT: packuswb %xmm9, %xmm7
; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: pmulhw %xmm7, %xmm3
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE41-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm10, %xmm3
+; SSE41-NEXT: pxor %xmm9, %xmm9
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; SSE41-NEXT: pxor %xmm10, %xmm10
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; SSE41-NEXT: pmulhw %xmm9, %xmm10
+; SSE41-NEXT: pxor %xmm9, %xmm9
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm10, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmulhw %xmm7, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: psrlw $8, %xmm7
-; SSE41-NEXT: packuswb %xmm2, %xmm7
-; SSE41-NEXT: pand %xmm8, %xmm3
+; SSE41-NEXT: pmulhw %xmm9, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: psrlw $8, %xmm9
+; SSE41-NEXT: packuswb %xmm2, %xmm9
+; SSE41-NEXT: pand %xmm8, %xmm10
; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: packuswb %xmm3, %xmm6
+; SSE41-NEXT: packuswb %xmm10, %xmm6
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pcmpgtb %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqb %xmm7, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
-; SSE41-NEXT: pmulhw %xmm3, %xmm7
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE41-NEXT: pcmpeqb %xmm9, %xmm2
+; SSE41-NEXT: pxor %xmm9, %xmm9
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15]
+; SSE41-NEXT: pxor %xmm10, %xmm10
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
+; SSE41-NEXT: pmulhw %xmm9, %xmm10
+; SSE41-NEXT: pxor %xmm9, %xmm9
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE41-NEXT: movdqa %xmm7, %xmm1
+; SSE41-NEXT: movdqa %xmm10, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmulhw %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: packuswb %xmm1, %xmm3
-; SSE41-NEXT: pand %xmm8, %xmm7
+; SSE41-NEXT: pmulhw %xmm9, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm9
+; SSE41-NEXT: psrlw $8, %xmm9
+; SSE41-NEXT: packuswb %xmm1, %xmm9
+; SSE41-NEXT: pand %xmm8, %xmm10
; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: packuswb %xmm7, %xmm5
+; SSE41-NEXT: packuswb %xmm10, %xmm5
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpgtb %xmm5, %xmm1
-; SSE41-NEXT: pcmpeqb %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE41-NEXT: pmulhw %xmm3, %xmm7
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE41-NEXT: pcmpeqb %xmm9, %xmm1
+; SSE41-NEXT: pxor %xmm9, %xmm9
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
+; SSE41-NEXT: pxor %xmm10, %xmm10
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; SSE41-NEXT: pmulhw %xmm9, %xmm10
+; SSE41-NEXT: pxor %xmm9, %xmm9
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm10, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmulhw %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: packuswb %xmm0, %xmm3
-; SSE41-NEXT: pand %xmm8, %xmm7
+; SSE41-NEXT: pmulhw %xmm9, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm9
+; SSE41-NEXT: psrlw $8, %xmm9
+; SSE41-NEXT: packuswb %xmm0, %xmm9
+; SSE41-NEXT: pand %xmm8, %xmm10
; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: packuswb %xmm7, %xmm4
+; SSE41-NEXT: packuswb %xmm10, %xmm4
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpgtb %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqb %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: pxor %xmm3, %xmm9
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm11, 48(%rsi)
+; SSE41-NEXT: pcmpeqb %xmm9, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE41-NEXT: pxor %xmm8, %xmm3
+; SSE41-NEXT: pxor %xmm8, %xmm2
+; SSE41-NEXT: pxor %xmm8, %xmm1
+; SSE41-NEXT: pxor %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm7, 48(%rsi)
; SSE41-NEXT: movdqa %xmm6, 32(%rsi)
; SSE41-NEXT: movdqa %xmm5, 16(%rsi)
; SSE41-NEXT: movdqa %xmm4, (%rsi)
-; SSE41-NEXT: pmovsxbd %xmm9, %xmm3
-; SSE41-NEXT: movdqa %xmm3, 192(%rdi)
-; SSE41-NEXT: pmovsxbd %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, 128(%rdi)
-; SSE41-NEXT: pmovsxbd %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, 64(%rdi)
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm3, (%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; SSE41-NEXT: pslld $31, %xmm3
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: movdqa %xmm3, 224(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,3,3,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; SSE41-NEXT: pslld $31, %xmm3
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: movdqa %xmm3, 240(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
+; SSE41-NEXT: pmovsxbd %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm4, 192(%rdi)
+; SSE41-NEXT: pmovsxbd %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm4, 128(%rdi)
+; SSE41-NEXT: pmovsxbd %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm4, 64(%rdi)
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: movdqa %xmm4, 224(%rdi)
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: movdqa %xmm4, 240(%rdi)
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
@@ -2409,75 +2409,75 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: movq %rdi, %rax
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
-; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm8
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm8
+; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm6
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm4
+; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm7
+; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8
+; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm6, %xmm10, %xmm6
-; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm7
-; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm9
-; AVX1-NEXT: vpcmpgtb %xmm9, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpeqb %xmm4, %xmm6, %xmm8
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX1-NEXT: vpmulhw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm6
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm8
+; AVX1-NEXT: vpcmpeqb %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; AVX1-NEXT: vpmulhw %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm9
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm3
-; AVX1-NEXT: vpackuswb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm1
-; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm11
-; AVX1-NEXT: vpcmpgtb %xmm11, %xmm5, %xmm4
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm12
+; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8
+; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm8
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm8, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX1-NEXT: vpmulhw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm4
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15]
+; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9
+; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; AVX1-NEXT: vpmulhw %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm6
-; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm1
-; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; AVX1-NEXT: vpmulhw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm6
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
+; AVX1-NEXT: vpmulhw %xmm3, %xmm10, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm10
+; AVX1-NEXT: vpackuswb %xmm11, %xmm10, %xmm10
+; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm9
+; AVX1-NEXT: vpcmpeqb %xmm10, %xmm9, %xmm10
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9
+; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
-; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4
-; AVX1-NEXT: vpand %xmm0, %xmm10, %xmm0
-; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm0
+; AVX1-NEXT: vpackuswb %xmm11, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9
+; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm9, %xmm0, %xmm9
+; AVX1-NEXT: vpcmpgtb %xmm9, %xmm5, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm6
-; AVX1-NEXT: vpxor %xmm7, %xmm12, %xmm5
-; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm2
-; AVX1-NEXT: vpxor %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm9, 48(%rsi)
-; AVX1-NEXT: vmovdqa %xmm11, 32(%rsi)
+; AVX1-NEXT: vpcmpeqd %xmm11, %xmm11, %xmm11
+; AVX1-NEXT: vpxor %xmm7, %xmm11, %xmm6
+; AVX1-NEXT: vpxor %xmm11, %xmm8, %xmm5
+; AVX1-NEXT: vpxor %xmm11, %xmm10, %xmm2
+; AVX1-NEXT: vpxor %xmm0, %xmm11, %xmm0
+; AVX1-NEXT: vmovdqa %xmm4, 48(%rsi)
+; AVX1-NEXT: vmovdqa %xmm1, 32(%rsi)
; AVX1-NEXT: vmovdqa %xmm3, 16(%rsi)
-; AVX1-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm9, (%rsi)
; AVX1-NEXT: vpmovsxbd %xmm6, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 192(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm1
@@ -2562,29 +2562,29 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX2-NEXT: vpmovsxbd %xmm4, %ymm8
+; AVX2-NEXT: vpmovsxbd %xmm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX2-NEXT: vpmovsxbd %xmm6, %ymm9
+; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
-; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,3,2,3]
+; AVX2-NEXT: vpmovsxbd %xmm9, %ymm9
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
-; AVX2-NEXT: vpmovsxbd %xmm4, %ymm4
+; AVX2-NEXT: vpmovsxbd %xmm8, %ymm8
; AVX2-NEXT: vmovdqa %ymm1, 32(%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm4, 192(%rdi)
+; AVX2-NEXT: vmovdqa %ymm8, 192(%rdi)
; AVX2-NEXT: vmovdqa %ymm3, 128(%rdi)
; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
-; AVX2-NEXT: vmovdqa %ymm6, 224(%rdi)
+; AVX2-NEXT: vmovdqa %ymm9, 224(%rdi)
; AVX2-NEXT: vmovdqa %ymm7, 160(%rdi)
-; AVX2-NEXT: vmovdqa %ymm9, 96(%rdi)
-; AVX2-NEXT: vmovdqa %ymm8, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm6, 96(%rdi)
+; AVX2-NEXT: vmovdqa %ymm4, 32(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2796,23 +2796,23 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE2-LABEL: smulo_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %r8
+; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: movq %xmm0, %rsi
-; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: xorl %r8d, %r8d
; SSE2-NEXT: imulq %rdx, %rsi
-; SSE2-NEXT: movq $-1, %r9
-; SSE2-NEXT: movl $0, %edx
-; SSE2-NEXT: cmovoq %r9, %rdx
+; SSE2-NEXT: movq $-1, %rdx
+; SSE2-NEXT: movl $0, %r9d
+; SSE2-NEXT: cmovoq %rdx, %r9
; SSE2-NEXT: movq %rsi, %xmm1
-; SSE2-NEXT: imulq %r8, %rcx
+; SSE2-NEXT: imulq %rax, %rcx
; SSE2-NEXT: movq %rcx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: cmovoq %r9, %rax
-; SSE2-NEXT: movq %rax, %xmm2
+; SSE2-NEXT: movq %r9, %xmm0
+; SSE2-NEXT: cmovoq %rdx, %r8
+; SSE2-NEXT: movq %r8, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa %xmm1, (%rdi)
@@ -2821,23 +2821,23 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSSE3-LABEL: smulo_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSSE3-NEXT: movq %xmm2, %r8
+; SSSE3-NEXT: movq %xmm2, %rax
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %rcx
; SSSE3-NEXT: movq %xmm1, %rdx
; SSSE3-NEXT: movq %xmm0, %rsi
-; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: xorl %r8d, %r8d
; SSSE3-NEXT: imulq %rdx, %rsi
-; SSSE3-NEXT: movq $-1, %r9
-; SSSE3-NEXT: movl $0, %edx
-; SSSE3-NEXT: cmovoq %r9, %rdx
+; SSSE3-NEXT: movq $-1, %rdx
+; SSSE3-NEXT: movl $0, %r9d
+; SSSE3-NEXT: cmovoq %rdx, %r9
; SSSE3-NEXT: movq %rsi, %xmm1
-; SSSE3-NEXT: imulq %r8, %rcx
+; SSSE3-NEXT: imulq %rax, %rcx
; SSSE3-NEXT: movq %rcx, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: movq %rdx, %xmm0
-; SSSE3-NEXT: cmovoq %r9, %rax
-; SSSE3-NEXT: movq %rax, %xmm2
+; SSSE3-NEXT: movq %r9, %xmm0
+; SSSE3-NEXT: cmovoq %rdx, %r8
+; SSSE3-NEXT: movq %r8, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: movdqa %xmm1, (%rdi)
@@ -2845,22 +2845,22 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
;
; SSE41-LABEL: smulo_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm1, %r8
+; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: pextrq $1, %xmm1, %rdx
; SSE41-NEXT: pextrq $1, %xmm0, %rsi
-; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: xorl %r8d, %r8d
; SSE41-NEXT: imulq %rdx, %rsi
-; SSE41-NEXT: movq $-1, %r9
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: cmovoq %r9, %rdx
+; SSE41-NEXT: movq $-1, %rdx
+; SSE41-NEXT: movl $0, %r9d
+; SSE41-NEXT: cmovoq %rdx, %r9
; SSE41-NEXT: movq %rsi, %xmm0
-; SSE41-NEXT: imulq %r8, %rcx
+; SSE41-NEXT: imulq %rax, %rcx
; SSE41-NEXT: movq %rcx, %xmm1
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: movq %rdx, %xmm0
-; SSE41-NEXT: cmovoq %r9, %rax
-; SSE41-NEXT: movq %rax, %xmm2
+; SSE41-NEXT: movq %r9, %xmm0
+; SSE41-NEXT: cmovoq %rdx, %r8
+; SSE41-NEXT: movq %r8, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: movdqa %xmm1, (%rdi)
@@ -2868,22 +2868,22 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
;
; AVX-LABEL: smulo_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovq %xmm1, %r8
+; AVX-NEXT: vmovq %xmm1, %rax
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: vpextrq $1, %xmm1, %rdx
; AVX-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: xorl %r8d, %r8d
; AVX-NEXT: imulq %rdx, %rsi
-; AVX-NEXT: movq $-1, %r9
-; AVX-NEXT: movl $0, %edx
-; AVX-NEXT: cmovoq %r9, %rdx
+; AVX-NEXT: movq $-1, %rdx
+; AVX-NEXT: movl $0, %r9d
+; AVX-NEXT: cmovoq %rdx, %r9
; AVX-NEXT: vmovq %rsi, %xmm0
-; AVX-NEXT: imulq %r8, %rcx
+; AVX-NEXT: imulq %rax, %rcx
; AVX-NEXT: vmovq %rcx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX-NEXT: vmovq %rdx, %xmm0
-; AVX-NEXT: cmovoq %r9, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: vmovq %r9, %xmm0
+; AVX-NEXT: cmovoq %rdx, %r8
+; AVX-NEXT: vmovq %r8, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovdqa %xmm1, (%rdi)
@@ -3297,126 +3297,125 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %r8, %r14
-; SSE2-NEXT: movq %rcx, %r11
-; SSE2-NEXT: movq %rdx, %r15
-; SSE2-NEXT: movq %rsi, %r13
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: movq %rsi, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %r14, %rsi
-; SSE2-NEXT: imulq %rcx, %rsi
+; SSE2-NEXT: movq %rdx, %r8
+; SSE2-NEXT: movq %rsi, %r11
+; SSE2-NEXT: movq %rdi, %r10
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSE2-NEXT: movq %r11, %r12
+; SSE2-NEXT: sarq $63, %r12
+; SSE2-NEXT: movq %r14, %rbx
+; SSE2-NEXT: imulq %r12, %rbx
; SSE2-NEXT: movq %r14, %rax
-; SSE2-NEXT: mulq %rcx
-; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: addq %rsi, %rdx
-; SSE2-NEXT: imulq %r9, %rcx
-; SSE2-NEXT: addq %rdx, %rcx
+; SSE2-NEXT: mulq %r12
+; SSE2-NEXT: movq %rax, %rdi
+; SSE2-NEXT: addq %rbx, %rdx
+; SSE2-NEXT: imulq %r9, %r12
+; SSE2-NEXT: addq %rdx, %r12
; SSE2-NEXT: movq %r9, %rbx
; SSE2-NEXT: sarq $63, %rbx
-; SSE2-NEXT: movq %rbx, %rsi
-; SSE2-NEXT: imulq %r13, %rsi
+; SSE2-NEXT: movq %rbx, %r13
+; SSE2-NEXT: imulq %r11, %r13
; SSE2-NEXT: movq %rbx, %rax
-; SSE2-NEXT: mulq %rdi
-; SSE2-NEXT: movq %rax, %r12
-; SSE2-NEXT: addq %rsi, %rdx
-; SSE2-NEXT: imulq %rdi, %rbx
+; SSE2-NEXT: mulq %r10
+; SSE2-NEXT: movq %rax, %r15
+; SSE2-NEXT: addq %r13, %rdx
+; SSE2-NEXT: imulq %r10, %rbx
; SSE2-NEXT: addq %rdx, %rbx
-; SSE2-NEXT: addq %r10, %r12
-; SSE2-NEXT: adcq %rcx, %rbx
-; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: addq %rdi, %r15
+; SSE2-NEXT: adcq %r12, %rbx
+; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %r14
-; SSE2-NEXT: movq %rdx, %rbp
-; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: movq %r13, %rax
+; SSE2-NEXT: movq %rdx, %r12
+; SSE2-NEXT: movq %rax, %rdi
+; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r14
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: addq %rbp, %rcx
-; SSE2-NEXT: adcq $0, %rsi
-; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: movq %rdx, %r14
+; SSE2-NEXT: movq %rax, %r13
+; SSE2-NEXT: addq %r12, %r13
+; SSE2-NEXT: adcq $0, %r14
+; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: movq %rdx, %rbp
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: addq %rcx, %rdi
-; SSE2-NEXT: adcq %rsi, %rbp
+; SSE2-NEXT: movq %rdx, %r12
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: addq %r13, %r10
+; SSE2-NEXT: adcq %r14, %r12
; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %ecx
-; SSE2-NEXT: movq %r13, %rax
+; SSE2-NEXT: movzbl %al, %r14d
+; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: addq %rbp, %rax
-; SSE2-NEXT: adcq %rcx, %rdx
; SSE2-NEXT: addq %r12, %rax
+; SSE2-NEXT: adcq %r14, %rdx
+; SSE2-NEXT: addq %r15, %rax
; SSE2-NEXT: adcq %rbx, %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE2-NEXT: movq %rdi, 8(%r13)
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: xorq %rdi, %rdx
-; SSE2-NEXT: xorq %rax, %rdi
-; SSE2-NEXT: xorl %r12d, %r12d
-; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: setne %r12b
-; SSE2-NEXT: movq %r11, %rdi
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: movq %r8, %rsi
-; SSE2-NEXT: imulq %rdi, %rsi
-; SSE2-NEXT: movq %r8, %rbx
-; SSE2-NEXT: mulq %rdi
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: addq %rsi, %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: imulq %r8, %rdi
-; SSE2-NEXT: addq %rdx, %rdi
-; SSE2-NEXT: movq %r8, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: movq %rsi, %rbp
-; SSE2-NEXT: imulq %r11, %rbp
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE2-NEXT: movq %r10, 8(%r12)
+; SSE2-NEXT: sarq $63, %r10
+; SSE2-NEXT: xorq %r10, %rdx
+; SSE2-NEXT: xorq %rax, %r10
+; SSE2-NEXT: xorl %r15d, %r15d
+; SSE2-NEXT: orq %rdx, %r10
+; SSE2-NEXT: setne %r15b
+; SSE2-NEXT: movq %rcx, %rbx
+; SSE2-NEXT: sarq $63, %rbx
+; SSE2-NEXT: movq %rsi, %r10
+; SSE2-NEXT: imulq %rbx, %r10
; SSE2-NEXT: movq %rsi, %rax
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: movq %rax, %r14
-; SSE2-NEXT: addq %rbp, %rdx
-; SSE2-NEXT: imulq %r15, %rsi
-; SSE2-NEXT: addq %rdx, %rsi
-; SSE2-NEXT: addq %rcx, %r14
-; SSE2-NEXT: adcq %rdi, %rsi
-; SSE2-NEXT: movq %r15, %rax
; SSE2-NEXT: mulq %rbx
-; SSE2-NEXT: movq %rdx, %rcx
; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: movq %r11, %rax
-; SSE2-NEXT: mulq %rbx
-; SSE2-NEXT: movq %rdx, %rbx
-; SSE2-NEXT: movq %rax, %rbp
-; SSE2-NEXT: addq %rcx, %rbp
-; SSE2-NEXT: adcq $0, %rbx
-; SSE2-NEXT: movq %r15, %rax
+; SSE2-NEXT: addq %r10, %rdx
+; SSE2-NEXT: imulq %rbp, %rbx
+; SSE2-NEXT: addq %rdx, %rbx
+; SSE2-NEXT: movq %rbp, %r10
+; SSE2-NEXT: sarq $63, %r10
+; SSE2-NEXT: movq %r10, %r14
+; SSE2-NEXT: imulq %rcx, %r14
+; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: movq %rdx, %rcx
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: addq %rbp, %rdi
-; SSE2-NEXT: adcq %rbx, %rcx
+; SSE2-NEXT: movq %rax, %r11
+; SSE2-NEXT: addq %r14, %rdx
+; SSE2-NEXT: imulq %r8, %r10
+; SSE2-NEXT: addq %rdx, %r10
+; SSE2-NEXT: addq %r9, %r11
+; SSE2-NEXT: adcq %rbx, %r10
+; SSE2-NEXT: movq %r8, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: movq %rdx, %r9
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: movq %rdx, %rsi
+; SSE2-NEXT: movq %rax, %r14
+; SSE2-NEXT: addq %r9, %r14
+; SSE2-NEXT: adcq $0, %rsi
+; SSE2-NEXT: movq %r8, %rax
+; SSE2-NEXT: mulq %rbp
+; SSE2-NEXT: movq %rdx, %r8
+; SSE2-NEXT: movq %rax, %r9
+; SSE2-NEXT: addq %r14, %r9
+; SSE2-NEXT: adcq %rsi, %r8
; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %ebp
-; SSE2-NEXT: movq %r11, %rax
-; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: addq %rcx, %rax
-; SSE2-NEXT: adcq %rbp, %rdx
-; SSE2-NEXT: addq %r14, %rax
+; SSE2-NEXT: movzbl %al, %esi
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rbp
+; SSE2-NEXT: addq %r8, %rax
; SSE2-NEXT: adcq %rsi, %rdx
-; SSE2-NEXT: movq %rdi, 24(%r13)
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: xorq %rdi, %rdx
-; SSE2-NEXT: xorq %rax, %rdi
+; SSE2-NEXT: addq %r11, %rax
+; SSE2-NEXT: adcq %r10, %rdx
+; SSE2-NEXT: movq %r9, 24(%r12)
+; SSE2-NEXT: sarq $63, %r9
+; SSE2-NEXT: xorq %r9, %rdx
+; SSE2-NEXT: xorq %rax, %r9
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdx, %rdi
+; SSE2-NEXT: orq %rdx, %r9
; SSE2-NEXT: setne %al
; SSE2-NEXT: negl %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: negl %r12d
-; SSE2-NEXT: movd %r12d, %xmm0
+; SSE2-NEXT: negl %r15d
+; SSE2-NEXT: movd %r15d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %r9, 16(%r13)
-; SSE2-NEXT: movq %r10, (%r13)
+; SSE2-NEXT: movq %rbx, 16(%r12)
+; SSE2-NEXT: movq %rdi, (%r12)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -3434,126 +3433,125 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: movq %r8, %r14
-; SSSE3-NEXT: movq %rcx, %r11
-; SSSE3-NEXT: movq %rdx, %r15
-; SSSE3-NEXT: movq %rsi, %r13
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSSE3-NEXT: movq %rsi, %rcx
-; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movq %r14, %rsi
-; SSSE3-NEXT: imulq %rcx, %rsi
+; SSSE3-NEXT: movq %rdx, %r8
+; SSSE3-NEXT: movq %rsi, %r11
+; SSSE3-NEXT: movq %rdi, %r10
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSSE3-NEXT: movq %r11, %r12
+; SSSE3-NEXT: sarq $63, %r12
+; SSSE3-NEXT: movq %r14, %rbx
+; SSSE3-NEXT: imulq %r12, %rbx
; SSSE3-NEXT: movq %r14, %rax
-; SSSE3-NEXT: mulq %rcx
-; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: addq %rsi, %rdx
-; SSSE3-NEXT: imulq %r9, %rcx
-; SSSE3-NEXT: addq %rdx, %rcx
+; SSSE3-NEXT: mulq %r12
+; SSSE3-NEXT: movq %rax, %rdi
+; SSSE3-NEXT: addq %rbx, %rdx
+; SSSE3-NEXT: imulq %r9, %r12
+; SSSE3-NEXT: addq %rdx, %r12
; SSSE3-NEXT: movq %r9, %rbx
; SSSE3-NEXT: sarq $63, %rbx
-; SSSE3-NEXT: movq %rbx, %rsi
-; SSSE3-NEXT: imulq %r13, %rsi
+; SSSE3-NEXT: movq %rbx, %r13
+; SSSE3-NEXT: imulq %r11, %r13
; SSSE3-NEXT: movq %rbx, %rax
-; SSSE3-NEXT: mulq %rdi
-; SSSE3-NEXT: movq %rax, %r12
-; SSSE3-NEXT: addq %rsi, %rdx
-; SSSE3-NEXT: imulq %rdi, %rbx
+; SSSE3-NEXT: mulq %r10
+; SSSE3-NEXT: movq %rax, %r15
+; SSSE3-NEXT: addq %r13, %rdx
+; SSSE3-NEXT: imulq %r10, %rbx
; SSSE3-NEXT: addq %rdx, %rbx
-; SSSE3-NEXT: addq %r10, %r12
-; SSSE3-NEXT: adcq %rcx, %rbx
-; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: addq %rdi, %r15
+; SSSE3-NEXT: adcq %r12, %rbx
+; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %r14
-; SSSE3-NEXT: movq %rdx, %rbp
-; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: movq %r13, %rax
+; SSSE3-NEXT: movq %rdx, %r12
+; SSSE3-NEXT: movq %rax, %rdi
+; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r14
-; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: addq %rbp, %rcx
-; SSSE3-NEXT: adcq $0, %rsi
-; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: movq %rdx, %r14
+; SSSE3-NEXT: movq %rax, %r13
+; SSSE3-NEXT: addq %r12, %r13
+; SSSE3-NEXT: adcq $0, %r14
+; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: movq %rdx, %rbp
-; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: addq %rcx, %rdi
-; SSSE3-NEXT: adcq %rsi, %rbp
+; SSSE3-NEXT: movq %rdx, %r12
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: addq %r13, %r10
+; SSSE3-NEXT: adcq %r14, %r12
; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %ecx
-; SSSE3-NEXT: movq %r13, %rax
+; SSSE3-NEXT: movzbl %al, %r14d
+; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: addq %rbp, %rax
-; SSSE3-NEXT: adcq %rcx, %rdx
; SSSE3-NEXT: addq %r12, %rax
+; SSSE3-NEXT: adcq %r14, %rdx
+; SSSE3-NEXT: addq %r15, %rax
; SSSE3-NEXT: adcq %rbx, %rdx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSSE3-NEXT: movq %rdi, 8(%r13)
-; SSSE3-NEXT: sarq $63, %rdi
-; SSSE3-NEXT: xorq %rdi, %rdx
-; SSSE3-NEXT: xorq %rax, %rdi
-; SSSE3-NEXT: xorl %r12d, %r12d
-; SSSE3-NEXT: orq %rdx, %rdi
-; SSSE3-NEXT: setne %r12b
-; SSSE3-NEXT: movq %r11, %rdi
-; SSSE3-NEXT: sarq $63, %rdi
-; SSSE3-NEXT: movq %r8, %rax
-; SSSE3-NEXT: movq %r8, %rsi
-; SSSE3-NEXT: imulq %rdi, %rsi
-; SSSE3-NEXT: movq %r8, %rbx
-; SSSE3-NEXT: mulq %rdi
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: addq %rsi, %rdx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSSE3-NEXT: imulq %r8, %rdi
-; SSSE3-NEXT: addq %rdx, %rdi
-; SSSE3-NEXT: movq %r8, %rsi
-; SSSE3-NEXT: sarq $63, %rsi
-; SSSE3-NEXT: movq %rsi, %rbp
-; SSSE3-NEXT: imulq %r11, %rbp
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSSE3-NEXT: movq %r10, 8(%r12)
+; SSSE3-NEXT: sarq $63, %r10
+; SSSE3-NEXT: xorq %r10, %rdx
+; SSSE3-NEXT: xorq %rax, %r10
+; SSSE3-NEXT: xorl %r15d, %r15d
+; SSSE3-NEXT: orq %rdx, %r10
+; SSSE3-NEXT: setne %r15b
+; SSSE3-NEXT: movq %rcx, %rbx
+; SSSE3-NEXT: sarq $63, %rbx
+; SSSE3-NEXT: movq %rsi, %r10
+; SSSE3-NEXT: imulq %rbx, %r10
; SSSE3-NEXT: movq %rsi, %rax
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: movq %rax, %r14
-; SSSE3-NEXT: addq %rbp, %rdx
-; SSSE3-NEXT: imulq %r15, %rsi
-; SSSE3-NEXT: addq %rdx, %rsi
-; SSSE3-NEXT: addq %rcx, %r14
-; SSSE3-NEXT: adcq %rdi, %rsi
-; SSSE3-NEXT: movq %r15, %rax
; SSSE3-NEXT: mulq %rbx
-; SSSE3-NEXT: movq %rdx, %rcx
; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: movq %r11, %rax
-; SSSE3-NEXT: mulq %rbx
-; SSSE3-NEXT: movq %rdx, %rbx
-; SSSE3-NEXT: movq %rax, %rbp
-; SSSE3-NEXT: addq %rcx, %rbp
-; SSSE3-NEXT: adcq $0, %rbx
-; SSSE3-NEXT: movq %r15, %rax
+; SSSE3-NEXT: addq %r10, %rdx
+; SSSE3-NEXT: imulq %rbp, %rbx
+; SSSE3-NEXT: addq %rdx, %rbx
+; SSSE3-NEXT: movq %rbp, %r10
+; SSSE3-NEXT: sarq $63, %r10
+; SSSE3-NEXT: movq %r10, %r14
+; SSSE3-NEXT: imulq %rcx, %r14
+; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: movq %rdx, %rcx
-; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: addq %rbp, %rdi
-; SSSE3-NEXT: adcq %rbx, %rcx
+; SSSE3-NEXT: movq %rax, %r11
+; SSSE3-NEXT: addq %r14, %rdx
+; SSSE3-NEXT: imulq %r8, %r10
+; SSSE3-NEXT: addq %rdx, %r10
+; SSSE3-NEXT: addq %r9, %r11
+; SSSE3-NEXT: adcq %rbx, %r10
+; SSSE3-NEXT: movq %r8, %rax
+; SSSE3-NEXT: mulq %rsi
+; SSSE3-NEXT: movq %rdx, %r9
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: mulq %rsi
+; SSSE3-NEXT: movq %rdx, %rsi
+; SSSE3-NEXT: movq %rax, %r14
+; SSSE3-NEXT: addq %r9, %r14
+; SSSE3-NEXT: adcq $0, %rsi
+; SSSE3-NEXT: movq %r8, %rax
+; SSSE3-NEXT: mulq %rbp
+; SSSE3-NEXT: movq %rdx, %r8
+; SSSE3-NEXT: movq %rax, %r9
+; SSSE3-NEXT: addq %r14, %r9
+; SSSE3-NEXT: adcq %rsi, %r8
; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %ebp
-; SSSE3-NEXT: movq %r11, %rax
-; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: addq %rcx, %rax
-; SSSE3-NEXT: adcq %rbp, %rdx
-; SSSE3-NEXT: addq %r14, %rax
+; SSSE3-NEXT: movzbl %al, %esi
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: mulq %rbp
+; SSSE3-NEXT: addq %r8, %rax
; SSSE3-NEXT: adcq %rsi, %rdx
-; SSSE3-NEXT: movq %rdi, 24(%r13)
-; SSSE3-NEXT: sarq $63, %rdi
-; SSSE3-NEXT: xorq %rdi, %rdx
-; SSSE3-NEXT: xorq %rax, %rdi
+; SSSE3-NEXT: addq %r11, %rax
+; SSSE3-NEXT: adcq %r10, %rdx
+; SSSE3-NEXT: movq %r9, 24(%r12)
+; SSSE3-NEXT: sarq $63, %r9
+; SSSE3-NEXT: xorq %r9, %rdx
+; SSSE3-NEXT: xorq %rax, %r9
; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: orq %rdx, %rdi
+; SSSE3-NEXT: orq %rdx, %r9
; SSSE3-NEXT: setne %al
; SSSE3-NEXT: negl %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: negl %r12d
-; SSSE3-NEXT: movd %r12d, %xmm0
+; SSSE3-NEXT: negl %r15d
+; SSSE3-NEXT: movd %r15d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %r9, 16(%r13)
-; SSSE3-NEXT: movq %r10, (%r13)
+; SSSE3-NEXT: movq %rbx, 16(%r12)
+; SSSE3-NEXT: movq %rdi, (%r12)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
@@ -3571,125 +3569,124 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: movq %r8, %r14
-; SSE41-NEXT: movq %rcx, %r11
-; SSE41-NEXT: movq %rdx, %r15
-; SSE41-NEXT: movq %rsi, %r13
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE41-NEXT: movq %rsi, %rcx
-; SSE41-NEXT: sarq $63, %rcx
-; SSE41-NEXT: movq %r14, %rsi
-; SSE41-NEXT: imulq %rcx, %rsi
+; SSE41-NEXT: movq %rdx, %r8
+; SSE41-NEXT: movq %rsi, %r11
+; SSE41-NEXT: movq %rdi, %r10
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSE41-NEXT: movq %r11, %r12
+; SSE41-NEXT: sarq $63, %r12
+; SSE41-NEXT: movq %r14, %rbx
+; SSE41-NEXT: imulq %r12, %rbx
; SSE41-NEXT: movq %r14, %rax
-; SSE41-NEXT: mulq %rcx
-; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: addq %rsi, %rdx
-; SSE41-NEXT: imulq %r9, %rcx
-; SSE41-NEXT: addq %rdx, %rcx
+; SSE41-NEXT: mulq %r12
+; SSE41-NEXT: movq %rax, %rdi
+; SSE41-NEXT: addq %rbx, %rdx
+; SSE41-NEXT: imulq %r9, %r12
+; SSE41-NEXT: addq %rdx, %r12
; SSE41-NEXT: movq %r9, %rbx
; SSE41-NEXT: sarq $63, %rbx
-; SSE41-NEXT: movq %rbx, %rsi
-; SSE41-NEXT: imulq %r13, %rsi
+; SSE41-NEXT: movq %rbx, %r13
+; SSE41-NEXT: imulq %r11, %r13
; SSE41-NEXT: movq %rbx, %rax
-; SSE41-NEXT: mulq %rdi
-; SSE41-NEXT: movq %rax, %r12
-; SSE41-NEXT: addq %rsi, %rdx
-; SSE41-NEXT: imulq %rdi, %rbx
+; SSE41-NEXT: mulq %r10
+; SSE41-NEXT: movq %rax, %r15
+; SSE41-NEXT: addq %r13, %rdx
+; SSE41-NEXT: imulq %r10, %rbx
; SSE41-NEXT: addq %rdx, %rbx
-; SSE41-NEXT: addq %r10, %r12
-; SSE41-NEXT: adcq %rcx, %rbx
-; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: addq %rdi, %r15
+; SSE41-NEXT: adcq %r12, %rbx
+; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %r14
-; SSE41-NEXT: movq %rdx, %rbp
-; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: movq %r13, %rax
+; SSE41-NEXT: movq %rdx, %r12
+; SSE41-NEXT: movq %rax, %rdi
+; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r14
-; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: addq %rbp, %rcx
-; SSE41-NEXT: adcq $0, %rsi
-; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: movq %rdx, %r14
+; SSE41-NEXT: movq %rax, %r13
+; SSE41-NEXT: addq %r12, %r13
+; SSE41-NEXT: adcq $0, %r14
+; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: movq %rdx, %rbp
-; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: addq %rcx, %rdi
-; SSE41-NEXT: adcq %rsi, %rbp
+; SSE41-NEXT: movq %rdx, %r12
+; SSE41-NEXT: movq %rax, %r10
+; SSE41-NEXT: addq %r13, %r10
+; SSE41-NEXT: adcq %r14, %r12
; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %ecx
-; SSE41-NEXT: movq %r13, %rax
+; SSE41-NEXT: movzbl %al, %r14d
+; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: addq %rbp, %rax
-; SSE41-NEXT: adcq %rcx, %rdx
; SSE41-NEXT: addq %r12, %rax
+; SSE41-NEXT: adcq %r14, %rdx
+; SSE41-NEXT: addq %r15, %rax
; SSE41-NEXT: adcq %rbx, %rdx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE41-NEXT: movq %rdi, 8(%r13)
-; SSE41-NEXT: sarq $63, %rdi
-; SSE41-NEXT: xorq %rdi, %rdx
-; SSE41-NEXT: xorq %rax, %rdi
-; SSE41-NEXT: xorl %r12d, %r12d
-; SSE41-NEXT: orq %rdx, %rdi
-; SSE41-NEXT: setne %r12b
-; SSE41-NEXT: movq %r11, %rdi
-; SSE41-NEXT: sarq $63, %rdi
-; SSE41-NEXT: movq %r8, %rax
-; SSE41-NEXT: movq %r8, %rsi
-; SSE41-NEXT: imulq %rdi, %rsi
-; SSE41-NEXT: movq %r8, %rbx
-; SSE41-NEXT: mulq %rdi
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: addq %rsi, %rdx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE41-NEXT: imulq %r8, %rdi
-; SSE41-NEXT: addq %rdx, %rdi
-; SSE41-NEXT: movq %r8, %rsi
-; SSE41-NEXT: sarq $63, %rsi
-; SSE41-NEXT: movq %rsi, %rbp
-; SSE41-NEXT: imulq %r11, %rbp
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE41-NEXT: movq %r10, 8(%r12)
+; SSE41-NEXT: sarq $63, %r10
+; SSE41-NEXT: xorq %r10, %rdx
+; SSE41-NEXT: xorq %rax, %r10
+; SSE41-NEXT: xorl %r15d, %r15d
+; SSE41-NEXT: orq %rdx, %r10
+; SSE41-NEXT: setne %r15b
+; SSE41-NEXT: movq %rcx, %rbx
+; SSE41-NEXT: sarq $63, %rbx
+; SSE41-NEXT: movq %rsi, %r10
+; SSE41-NEXT: imulq %rbx, %r10
; SSE41-NEXT: movq %rsi, %rax
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: movq %rax, %r14
-; SSE41-NEXT: addq %rbp, %rdx
-; SSE41-NEXT: imulq %r15, %rsi
-; SSE41-NEXT: addq %rdx, %rsi
-; SSE41-NEXT: addq %rcx, %r14
-; SSE41-NEXT: adcq %rdi, %rsi
-; SSE41-NEXT: movq %r15, %rax
; SSE41-NEXT: mulq %rbx
-; SSE41-NEXT: movq %rdx, %rcx
; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: movq %r11, %rax
-; SSE41-NEXT: mulq %rbx
-; SSE41-NEXT: movq %rdx, %rbx
-; SSE41-NEXT: movq %rax, %rbp
-; SSE41-NEXT: addq %rcx, %rbp
-; SSE41-NEXT: adcq $0, %rbx
-; SSE41-NEXT: movq %r15, %rax
+; SSE41-NEXT: addq %r10, %rdx
+; SSE41-NEXT: imulq %rbp, %rbx
+; SSE41-NEXT: addq %rdx, %rbx
+; SSE41-NEXT: movq %rbp, %r10
+; SSE41-NEXT: sarq $63, %r10
+; SSE41-NEXT: movq %r10, %r14
+; SSE41-NEXT: imulq %rcx, %r14
+; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: movq %rdx, %rcx
-; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: addq %rbp, %rdi
-; SSE41-NEXT: adcq %rbx, %rcx
+; SSE41-NEXT: movq %rax, %r11
+; SSE41-NEXT: addq %r14, %rdx
+; SSE41-NEXT: imulq %r8, %r10
+; SSE41-NEXT: addq %rdx, %r10
+; SSE41-NEXT: addq %r9, %r11
+; SSE41-NEXT: adcq %rbx, %r10
+; SSE41-NEXT: movq %r8, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: movq %rdx, %r9
+; SSE41-NEXT: movq %rax, %rbx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: movq %rdx, %rsi
+; SSE41-NEXT: movq %rax, %r14
+; SSE41-NEXT: addq %r9, %r14
+; SSE41-NEXT: adcq $0, %rsi
+; SSE41-NEXT: movq %r8, %rax
+; SSE41-NEXT: mulq %rbp
+; SSE41-NEXT: movq %rdx, %r8
+; SSE41-NEXT: movq %rax, %r9
+; SSE41-NEXT: addq %r14, %r9
+; SSE41-NEXT: adcq %rsi, %r8
; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %ebp
-; SSE41-NEXT: movq %r11, %rax
-; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: addq %rcx, %rax
-; SSE41-NEXT: adcq %rbp, %rdx
-; SSE41-NEXT: addq %r14, %rax
+; SSE41-NEXT: movzbl %al, %esi
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rbp
+; SSE41-NEXT: addq %r8, %rax
; SSE41-NEXT: adcq %rsi, %rdx
-; SSE41-NEXT: movq %rdi, 24(%r13)
-; SSE41-NEXT: sarq $63, %rdi
-; SSE41-NEXT: xorq %rdi, %rdx
-; SSE41-NEXT: xorq %rax, %rdi
+; SSE41-NEXT: addq %r11, %rax
+; SSE41-NEXT: adcq %r10, %rdx
+; SSE41-NEXT: movq %r9, 24(%r12)
+; SSE41-NEXT: sarq $63, %r9
+; SSE41-NEXT: xorq %r9, %rdx
+; SSE41-NEXT: xorq %rax, %r9
; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %rdx, %rdi
+; SSE41-NEXT: orq %rdx, %r9
; SSE41-NEXT: setne %al
; SSE41-NEXT: negl %eax
-; SSE41-NEXT: negl %r12d
-; SSE41-NEXT: movd %r12d, %xmm0
+; SSE41-NEXT: negl %r15d
+; SSE41-NEXT: movd %r15d, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: movq %r9, 16(%r13)
-; SSE41-NEXT: movq %r10, (%r13)
+; SSE41-NEXT: movq %rbx, 16(%r12)
+; SSE41-NEXT: movq %rdi, (%r12)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
@@ -3707,125 +3704,124 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %r8, %r14
-; AVX-NEXT: movq %rcx, %r11
-; AVX-NEXT: movq %rdx, %r15
-; AVX-NEXT: movq %rsi, %r13
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX-NEXT: movq %rsi, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %r14, %rsi
-; AVX-NEXT: imulq %rcx, %rsi
+; AVX-NEXT: movq %rdx, %r8
+; AVX-NEXT: movq %rsi, %r11
+; AVX-NEXT: movq %rdi, %r10
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT: movq %r11, %r12
+; AVX-NEXT: sarq $63, %r12
+; AVX-NEXT: movq %r14, %rbx
+; AVX-NEXT: imulq %r12, %rbx
; AVX-NEXT: movq %r14, %rax
-; AVX-NEXT: mulq %rcx
-; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: addq %rsi, %rdx
-; AVX-NEXT: imulq %r9, %rcx
-; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: mulq %r12
+; AVX-NEXT: movq %rax, %rdi
+; AVX-NEXT: addq %rbx, %rdx
+; AVX-NEXT: imulq %r9, %r12
+; AVX-NEXT: addq %rdx, %r12
; AVX-NEXT: movq %r9, %rbx
; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: movq %rbx, %rsi
-; AVX-NEXT: imulq %r13, %rsi
+; AVX-NEXT: movq %rbx, %r13
+; AVX-NEXT: imulq %r11, %r13
; AVX-NEXT: movq %rbx, %rax
-; AVX-NEXT: mulq %rdi
-; AVX-NEXT: movq %rax, %r12
-; AVX-NEXT: addq %rsi, %rdx
-; AVX-NEXT: imulq %rdi, %rbx
+; AVX-NEXT: mulq %r10
+; AVX-NEXT: movq %rax, %r15
+; AVX-NEXT: addq %r13, %rdx
+; AVX-NEXT: imulq %r10, %rbx
; AVX-NEXT: addq %rdx, %rbx
-; AVX-NEXT: addq %r10, %r12
-; AVX-NEXT: adcq %rcx, %rbx
-; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: addq %rdi, %r15
+; AVX-NEXT: adcq %r12, %rbx
+; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r14
-; AVX-NEXT: movq %rdx, %rbp
-; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: movq %r13, %rax
+; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: movq %rax, %rdi
+; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %r14
-; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: addq %rbp, %rcx
-; AVX-NEXT: adcq $0, %rsi
-; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: movq %rdx, %r14
+; AVX-NEXT: movq %rax, %r13
+; AVX-NEXT: addq %r12, %r13
+; AVX-NEXT: adcq $0, %r14
+; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r9
-; AVX-NEXT: movq %rdx, %rbp
-; AVX-NEXT: movq %rax, %rdi
-; AVX-NEXT: addq %rcx, %rdi
-; AVX-NEXT: adcq %rsi, %rbp
+; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: movq %rax, %r10
+; AVX-NEXT: addq %r13, %r10
+; AVX-NEXT: adcq %r14, %r12
; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %ecx
-; AVX-NEXT: movq %r13, %rax
+; AVX-NEXT: movzbl %al, %r14d
+; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %r9
-; AVX-NEXT: addq %rbp, %rax
-; AVX-NEXT: adcq %rcx, %rdx
; AVX-NEXT: addq %r12, %rax
+; AVX-NEXT: adcq %r14, %rdx
+; AVX-NEXT: addq %r15, %rax
; AVX-NEXT: adcq %rbx, %rdx
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX-NEXT: movq %rdi, 8(%r13)
-; AVX-NEXT: sarq $63, %rdi
-; AVX-NEXT: xorq %rdi, %rdx
-; AVX-NEXT: xorq %rax, %rdi
-; AVX-NEXT: xorl %r12d, %r12d
-; AVX-NEXT: orq %rdx, %rdi
-; AVX-NEXT: setne %r12b
-; AVX-NEXT: movq %r11, %rdi
-; AVX-NEXT: sarq $63, %rdi
-; AVX-NEXT: movq %r8, %rax
-; AVX-NEXT: movq %r8, %rsi
-; AVX-NEXT: imulq %rdi, %rsi
-; AVX-NEXT: movq %r8, %rbx
-; AVX-NEXT: mulq %rdi
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: addq %rsi, %rdx
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX-NEXT: imulq %r8, %rdi
-; AVX-NEXT: addq %rdx, %rdi
-; AVX-NEXT: movq %r8, %rsi
-; AVX-NEXT: sarq $63, %rsi
-; AVX-NEXT: movq %rsi, %rbp
-; AVX-NEXT: imulq %r11, %rbp
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX-NEXT: movq %r10, 8(%r12)
+; AVX-NEXT: sarq $63, %r10
+; AVX-NEXT: xorq %r10, %rdx
+; AVX-NEXT: xorq %rax, %r10
+; AVX-NEXT: xorl %r15d, %r15d
+; AVX-NEXT: orq %rdx, %r10
+; AVX-NEXT: setne %r15b
+; AVX-NEXT: movq %rcx, %rbx
+; AVX-NEXT: sarq $63, %rbx
+; AVX-NEXT: movq %rsi, %r10
+; AVX-NEXT: imulq %rbx, %r10
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: mulq %r15
-; AVX-NEXT: movq %rax, %r14
-; AVX-NEXT: addq %rbp, %rdx
-; AVX-NEXT: imulq %r15, %rsi
-; AVX-NEXT: addq %rdx, %rsi
-; AVX-NEXT: addq %rcx, %r14
-; AVX-NEXT: adcq %rdi, %rsi
-; AVX-NEXT: movq %r15, %rax
; AVX-NEXT: mulq %rbx
-; AVX-NEXT: movq %rdx, %rcx
; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: movq %r11, %rax
-; AVX-NEXT: mulq %rbx
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: movq %rax, %rbp
-; AVX-NEXT: addq %rcx, %rbp
-; AVX-NEXT: adcq $0, %rbx
-; AVX-NEXT: movq %r15, %rax
+; AVX-NEXT: addq %r10, %rdx
+; AVX-NEXT: imulq %rbp, %rbx
+; AVX-NEXT: addq %rdx, %rbx
+; AVX-NEXT: movq %rbp, %r10
+; AVX-NEXT: sarq $63, %r10
+; AVX-NEXT: movq %r10, %r14
+; AVX-NEXT: imulq %rcx, %r14
+; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r8
-; AVX-NEXT: movq %rdx, %rcx
-; AVX-NEXT: movq %rax, %rdi
-; AVX-NEXT: addq %rbp, %rdi
-; AVX-NEXT: adcq %rbx, %rcx
+; AVX-NEXT: movq %rax, %r11
+; AVX-NEXT: addq %r14, %rdx
+; AVX-NEXT: imulq %r8, %r10
+; AVX-NEXT: addq %rdx, %r10
+; AVX-NEXT: addq %r9, %r11
+; AVX-NEXT: adcq %rbx, %r10
+; AVX-NEXT: movq %r8, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rdx, %r9
+; AVX-NEXT: movq %rax, %rbx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rdx, %rsi
+; AVX-NEXT: movq %rax, %r14
+; AVX-NEXT: addq %r9, %r14
+; AVX-NEXT: adcq $0, %rsi
+; AVX-NEXT: movq %r8, %rax
+; AVX-NEXT: mulq %rbp
+; AVX-NEXT: movq %rdx, %r8
+; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: addq %r14, %r9
+; AVX-NEXT: adcq %rsi, %r8
; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %ebp
-; AVX-NEXT: movq %r11, %rax
-; AVX-NEXT: mulq %r8
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: adcq %rbp, %rdx
-; AVX-NEXT: addq %r14, %rax
+; AVX-NEXT: movzbl %al, %esi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rbp
+; AVX-NEXT: addq %r8, %rax
; AVX-NEXT: adcq %rsi, %rdx
-; AVX-NEXT: movq %rdi, 24(%r13)
-; AVX-NEXT: sarq $63, %rdi
-; AVX-NEXT: xorq %rdi, %rdx
-; AVX-NEXT: xorq %rax, %rdi
+; AVX-NEXT: addq %r11, %rax
+; AVX-NEXT: adcq %r10, %rdx
+; AVX-NEXT: movq %r9, 24(%r12)
+; AVX-NEXT: sarq $63, %r9
+; AVX-NEXT: xorq %r9, %rdx
+; AVX-NEXT: xorq %rax, %r9
; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: orq %rdx, %rdi
+; AVX-NEXT: orq %rdx, %r9
; AVX-NEXT: setne %al
; AVX-NEXT: negl %eax
-; AVX-NEXT: negl %r12d
-; AVX-NEXT: vmovd %r12d, %xmm0
+; AVX-NEXT: negl %r15d
+; AVX-NEXT: vmovd %r15d, %xmm0
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %r9, 16(%r13)
-; AVX-NEXT: movq %r10, (%r13)
+; AVX-NEXT: movq %rbx, 16(%r12)
+; AVX-NEXT: movq %rdi, (%r12)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
; AVX-NEXT: popq %r13
@@ -3842,119 +3838,117 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: pushq %r13
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %r9, %r10
-; AVX512F-NEXT: movq %r8, %r9
-; AVX512F-NEXT: movq %rcx, %r14
-; AVX512F-NEXT: movq %rdx, %rcx
-; AVX512F-NEXT: movq %rsi, %r11
-; AVX512F-NEXT: movq %rdi, %r15
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT: movq %r14, %rdi
-; AVX512F-NEXT: sarq $63, %rdi
-; AVX512F-NEXT: movq %r12, %rbx
-; AVX512F-NEXT: imulq %rdi, %rbx
-; AVX512F-NEXT: movq %r12, %rax
-; AVX512F-NEXT: mulq %rdi
-; AVX512F-NEXT: movq %rax, %rsi
+; AVX512F-NEXT: movq %r9, %rbp
+; AVX512F-NEXT: movq %rcx, %r11
+; AVX512F-NEXT: movq %rdx, %r10
+; AVX512F-NEXT: movq %rsi, %r9
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512F-NEXT: movq %rcx, %r12
+; AVX512F-NEXT: sarq $63, %r12
+; AVX512F-NEXT: movq %r15, %rbx
+; AVX512F-NEXT: imulq %r12, %rbx
+; AVX512F-NEXT: movq %r15, %rax
+; AVX512F-NEXT: mulq %r12
+; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: addq %rbx, %rdx
-; AVX512F-NEXT: imulq %r8, %rdi
-; AVX512F-NEXT: addq %rdx, %rdi
-; AVX512F-NEXT: movq %r8, %rbx
+; AVX512F-NEXT: imulq %rsi, %r12
+; AVX512F-NEXT: addq %rdx, %r12
+; AVX512F-NEXT: movq %rsi, %rbx
; AVX512F-NEXT: sarq $63, %rbx
-; AVX512F-NEXT: movq %rbx, %rbp
-; AVX512F-NEXT: imulq %r14, %rbp
+; AVX512F-NEXT: movq %rbx, %r13
+; AVX512F-NEXT: imulq %r11, %r13
; AVX512F-NEXT: movq %rbx, %rax
-; AVX512F-NEXT: mulq %rcx
-; AVX512F-NEXT: movq %rax, %r13
-; AVX512F-NEXT: addq %rbp, %rdx
-; AVX512F-NEXT: imulq %rcx, %rbx
+; AVX512F-NEXT: mulq %r10
+; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: addq %r13, %rdx
+; AVX512F-NEXT: imulq %r10, %rbx
; AVX512F-NEXT: addq %rdx, %rbx
-; AVX512F-NEXT: addq %rsi, %r13
-; AVX512F-NEXT: adcq %rdi, %rbx
-; AVX512F-NEXT: movq %rcx, %rax
-; AVX512F-NEXT: mulq %r12
-; AVX512F-NEXT: movq %rdx, %rbp
-; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: movq %r14, %rax
-; AVX512F-NEXT: mulq %r12
-; AVX512F-NEXT: movq %rdx, %rdi
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: addq %rbp, %rsi
-; AVX512F-NEXT: adcq $0, %rdi
-; AVX512F-NEXT: movq %rcx, %rax
-; AVX512F-NEXT: mulq %r8
-; AVX512F-NEXT: movq %rdx, %rbp
+; AVX512F-NEXT: addq %rcx, %r14
+; AVX512F-NEXT: adcq %r12, %rbx
+; AVX512F-NEXT: movq %r10, %rax
+; AVX512F-NEXT: mulq %r15
+; AVX512F-NEXT: movq %rdx, %r12
; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: addq %rsi, %rcx
-; AVX512F-NEXT: adcq %rdi, %rbp
+; AVX512F-NEXT: movq %r11, %rax
+; AVX512F-NEXT: mulq %r15
+; AVX512F-NEXT: movq %rdx, %r15
+; AVX512F-NEXT: movq %rax, %r13
+; AVX512F-NEXT: addq %r12, %r13
+; AVX512F-NEXT: adcq $0, %r15
+; AVX512F-NEXT: movq %r10, %rax
+; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: movq %rdx, %r12
+; AVX512F-NEXT: movq %rax, %r10
+; AVX512F-NEXT: addq %r13, %r10
+; AVX512F-NEXT: adcq %r15, %r12
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %esi
-; AVX512F-NEXT: movq %r14, %rax
-; AVX512F-NEXT: mulq %r8
-; AVX512F-NEXT: addq %rbp, %rax
-; AVX512F-NEXT: adcq %rsi, %rdx
-; AVX512F-NEXT: addq %r13, %rax
+; AVX512F-NEXT: movzbl %al, %r15d
+; AVX512F-NEXT: movq %r11, %rax
+; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: addq %r12, %rax
+; AVX512F-NEXT: adcq %r15, %rdx
+; AVX512F-NEXT: addq %r14, %rax
; AVX512F-NEXT: adcq %rbx, %rdx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT: movq %rcx, 24(%r8)
-; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: xorq %rcx, %rdx
-; AVX512F-NEXT: xorq %rax, %rcx
-; AVX512F-NEXT: orq %rdx, %rcx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512F-NEXT: movq %r10, 24(%r12)
+; AVX512F-NEXT: sarq $63, %r10
+; AVX512F-NEXT: xorq %r10, %rdx
+; AVX512F-NEXT: xorq %rax, %r10
+; AVX512F-NEXT: orq %rdx, %r10
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: movq %r11, %rdi
-; AVX512F-NEXT: sarq $63, %rdi
; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: imulq %rdi, %rsi
-; AVX512F-NEXT: movq %r9, %rax
-; AVX512F-NEXT: mulq %rdi
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: addq %rsi, %rdx
-; AVX512F-NEXT: imulq %r10, %rdi
-; AVX512F-NEXT: addq %rdx, %rdi
-; AVX512F-NEXT: movq %r10, %rsi
; AVX512F-NEXT: sarq $63, %rsi
-; AVX512F-NEXT: movq %rsi, %rbp
-; AVX512F-NEXT: imulq %r11, %rbp
-; AVX512F-NEXT: movq %rsi, %rax
-; AVX512F-NEXT: mulq %r15
-; AVX512F-NEXT: movq %rax, %r12
-; AVX512F-NEXT: addq %rbp, %rdx
-; AVX512F-NEXT: imulq %r15, %rsi
+; AVX512F-NEXT: movq %r8, %r11
+; AVX512F-NEXT: imulq %rsi, %r11
+; AVX512F-NEXT: movq %r8, %rax
+; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: movq %rax, %r10
+; AVX512F-NEXT: addq %r11, %rdx
+; AVX512F-NEXT: imulq %rbp, %rsi
; AVX512F-NEXT: addq %rdx, %rsi
-; AVX512F-NEXT: addq %rcx, %r12
-; AVX512F-NEXT: adcq %rdi, %rsi
-; AVX512F-NEXT: movq %r15, %rax
-; AVX512F-NEXT: mulq %r9
-; AVX512F-NEXT: movq %rdx, %rcx
-; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: movq %rbp, %r11
+; AVX512F-NEXT: sarq $63, %r11
+; AVX512F-NEXT: movq %r11, %r14
+; AVX512F-NEXT: imulq %r9, %r14
; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %r9
-; AVX512F-NEXT: movq %rdx, %rbp
+; AVX512F-NEXT: mulq %rdi
; AVX512F-NEXT: movq %rax, %rbx
-; AVX512F-NEXT: addq %rcx, %rbx
-; AVX512F-NEXT: adcq $0, %rbp
-; AVX512F-NEXT: movq %r15, %rax
-; AVX512F-NEXT: mulq %r10
-; AVX512F-NEXT: movq %rdx, %rcx
-; AVX512F-NEXT: movq %rax, %rdi
-; AVX512F-NEXT: addq %rbx, %rdi
-; AVX512F-NEXT: adcq %rbp, %rcx
+; AVX512F-NEXT: addq %r14, %rdx
+; AVX512F-NEXT: imulq %rdi, %r11
+; AVX512F-NEXT: addq %rdx, %r11
+; AVX512F-NEXT: addq %r10, %rbx
+; AVX512F-NEXT: adcq %rsi, %r11
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: mulq %r8
+; AVX512F-NEXT: movq %rdx, %r10
+; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: mulq %r8
+; AVX512F-NEXT: movq %rdx, %r8
+; AVX512F-NEXT: movq %rax, %r15
+; AVX512F-NEXT: addq %r10, %r15
+; AVX512F-NEXT: adcq $0, %r8
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: movq %rdx, %rdi
+; AVX512F-NEXT: movq %rax, %r10
+; AVX512F-NEXT: addq %r15, %r10
+; AVX512F-NEXT: adcq %r8, %rdi
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %ebp
-; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %r10
-; AVX512F-NEXT: addq %rcx, %rax
-; AVX512F-NEXT: adcq %rbp, %rdx
-; AVX512F-NEXT: addq %r12, %rax
+; AVX512F-NEXT: movzbl %al, %esi
+; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: addq %rdi, %rax
; AVX512F-NEXT: adcq %rsi, %rdx
-; AVX512F-NEXT: movq %rdi, 8(%r8)
-; AVX512F-NEXT: sarq $63, %rdi
-; AVX512F-NEXT: xorq %rdi, %rdx
-; AVX512F-NEXT: xorq %rax, %rdi
-; AVX512F-NEXT: orq %rdx, %rdi
+; AVX512F-NEXT: addq %rbx, %rax
+; AVX512F-NEXT: adcq %r11, %rdx
+; AVX512F-NEXT: movq %r10, 8(%r12)
+; AVX512F-NEXT: sarq $63, %r10
+; AVX512F-NEXT: xorq %r10, %rdx
+; AVX512F-NEXT: xorq %rax, %r10
+; AVX512F-NEXT: orq %rdx, %r10
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: andl $1, %eax
; AVX512F-NEXT: kmovw %eax, %k1
@@ -3962,9 +3956,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT: movq %rax, 16(%r8)
-; AVX512F-NEXT: movq %r14, (%r8)
+; AVX512F-NEXT: movq %rcx, 16(%r12)
+; AVX512F-NEXT: movq %r14, (%r12)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
; AVX512F-NEXT: popq %r13
@@ -3981,119 +3974,117 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: pushq %r13
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
-; AVX512BW-NEXT: movq %r9, %r10
-; AVX512BW-NEXT: movq %r8, %r9
-; AVX512BW-NEXT: movq %rcx, %r14
-; AVX512BW-NEXT: movq %rdx, %rcx
-; AVX512BW-NEXT: movq %rsi, %r11
-; AVX512BW-NEXT: movq %rdi, %r15
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512BW-NEXT: movq %r14, %rdi
-; AVX512BW-NEXT: sarq $63, %rdi
-; AVX512BW-NEXT: movq %r12, %rbx
-; AVX512BW-NEXT: imulq %rdi, %rbx
-; AVX512BW-NEXT: movq %r12, %rax
-; AVX512BW-NEXT: mulq %rdi
-; AVX512BW-NEXT: movq %rax, %rsi
+; AVX512BW-NEXT: movq %r9, %rbp
+; AVX512BW-NEXT: movq %rcx, %r11
+; AVX512BW-NEXT: movq %rdx, %r10
+; AVX512BW-NEXT: movq %rsi, %r9
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512BW-NEXT: movq %rcx, %r12
+; AVX512BW-NEXT: sarq $63, %r12
+; AVX512BW-NEXT: movq %r15, %rbx
+; AVX512BW-NEXT: imulq %r12, %rbx
+; AVX512BW-NEXT: movq %r15, %rax
+; AVX512BW-NEXT: mulq %r12
+; AVX512BW-NEXT: movq %rax, %rcx
; AVX512BW-NEXT: addq %rbx, %rdx
-; AVX512BW-NEXT: imulq %r8, %rdi
-; AVX512BW-NEXT: addq %rdx, %rdi
-; AVX512BW-NEXT: movq %r8, %rbx
+; AVX512BW-NEXT: imulq %rsi, %r12
+; AVX512BW-NEXT: addq %rdx, %r12
+; AVX512BW-NEXT: movq %rsi, %rbx
; AVX512BW-NEXT: sarq $63, %rbx
-; AVX512BW-NEXT: movq %rbx, %rbp
-; AVX512BW-NEXT: imulq %r14, %rbp
+; AVX512BW-NEXT: movq %rbx, %r13
+; AVX512BW-NEXT: imulq %r11, %r13
; AVX512BW-NEXT: movq %rbx, %rax
-; AVX512BW-NEXT: mulq %rcx
-; AVX512BW-NEXT: movq %rax, %r13
-; AVX512BW-NEXT: addq %rbp, %rdx
-; AVX512BW-NEXT: imulq %rcx, %rbx
+; AVX512BW-NEXT: mulq %r10
+; AVX512BW-NEXT: movq %rax, %r14
+; AVX512BW-NEXT: addq %r13, %rdx
+; AVX512BW-NEXT: imulq %r10, %rbx
; AVX512BW-NEXT: addq %rdx, %rbx
-; AVX512BW-NEXT: addq %rsi, %r13
-; AVX512BW-NEXT: adcq %rdi, %rbx
-; AVX512BW-NEXT: movq %rcx, %rax
-; AVX512BW-NEXT: mulq %r12
-; AVX512BW-NEXT: movq %rdx, %rbp
-; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: movq %r14, %rax
-; AVX512BW-NEXT: mulq %r12
-; AVX512BW-NEXT: movq %rdx, %rdi
-; AVX512BW-NEXT: movq %rax, %rsi
-; AVX512BW-NEXT: addq %rbp, %rsi
-; AVX512BW-NEXT: adcq $0, %rdi
-; AVX512BW-NEXT: movq %rcx, %rax
-; AVX512BW-NEXT: mulq %r8
-; AVX512BW-NEXT: movq %rdx, %rbp
+; AVX512BW-NEXT: addq %rcx, %r14
+; AVX512BW-NEXT: adcq %r12, %rbx
+; AVX512BW-NEXT: movq %r10, %rax
+; AVX512BW-NEXT: mulq %r15
+; AVX512BW-NEXT: movq %rdx, %r12
; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: addq %rsi, %rcx
-; AVX512BW-NEXT: adcq %rdi, %rbp
+; AVX512BW-NEXT: movq %r11, %rax
+; AVX512BW-NEXT: mulq %r15
+; AVX512BW-NEXT: movq %rdx, %r15
+; AVX512BW-NEXT: movq %rax, %r13
+; AVX512BW-NEXT: addq %r12, %r13
+; AVX512BW-NEXT: adcq $0, %r15
+; AVX512BW-NEXT: movq %r10, %rax
+; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: movq %rdx, %r12
+; AVX512BW-NEXT: movq %rax, %r10
+; AVX512BW-NEXT: addq %r13, %r10
+; AVX512BW-NEXT: adcq %r15, %r12
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %esi
-; AVX512BW-NEXT: movq %r14, %rax
-; AVX512BW-NEXT: mulq %r8
-; AVX512BW-NEXT: addq %rbp, %rax
-; AVX512BW-NEXT: adcq %rsi, %rdx
-; AVX512BW-NEXT: addq %r13, %rax
+; AVX512BW-NEXT: movzbl %al, %r15d
+; AVX512BW-NEXT: movq %r11, %rax
+; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: addq %r12, %rax
+; AVX512BW-NEXT: adcq %r15, %rdx
+; AVX512BW-NEXT: addq %r14, %rax
; AVX512BW-NEXT: adcq %rbx, %rdx
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512BW-NEXT: movq %rcx, 24(%r8)
-; AVX512BW-NEXT: sarq $63, %rcx
-; AVX512BW-NEXT: xorq %rcx, %rdx
-; AVX512BW-NEXT: xorq %rax, %rcx
-; AVX512BW-NEXT: orq %rdx, %rcx
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512BW-NEXT: movq %r10, 24(%r12)
+; AVX512BW-NEXT: sarq $63, %r10
+; AVX512BW-NEXT: xorq %r10, %rdx
+; AVX512BW-NEXT: xorq %rax, %r10
+; AVX512BW-NEXT: orq %rdx, %r10
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: movq %r11, %rdi
-; AVX512BW-NEXT: sarq $63, %rdi
; AVX512BW-NEXT: movq %r9, %rsi
-; AVX512BW-NEXT: imulq %rdi, %rsi
-; AVX512BW-NEXT: movq %r9, %rax
-; AVX512BW-NEXT: mulq %rdi
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: addq %rsi, %rdx
-; AVX512BW-NEXT: imulq %r10, %rdi
-; AVX512BW-NEXT: addq %rdx, %rdi
-; AVX512BW-NEXT: movq %r10, %rsi
; AVX512BW-NEXT: sarq $63, %rsi
-; AVX512BW-NEXT: movq %rsi, %rbp
-; AVX512BW-NEXT: imulq %r11, %rbp
-; AVX512BW-NEXT: movq %rsi, %rax
-; AVX512BW-NEXT: mulq %r15
-; AVX512BW-NEXT: movq %rax, %r12
-; AVX512BW-NEXT: addq %rbp, %rdx
-; AVX512BW-NEXT: imulq %r15, %rsi
+; AVX512BW-NEXT: movq %r8, %r11
+; AVX512BW-NEXT: imulq %rsi, %r11
+; AVX512BW-NEXT: movq %r8, %rax
+; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: movq %rax, %r10
+; AVX512BW-NEXT: addq %r11, %rdx
+; AVX512BW-NEXT: imulq %rbp, %rsi
; AVX512BW-NEXT: addq %rdx, %rsi
-; AVX512BW-NEXT: addq %rcx, %r12
-; AVX512BW-NEXT: adcq %rdi, %rsi
-; AVX512BW-NEXT: movq %r15, %rax
-; AVX512BW-NEXT: mulq %r9
-; AVX512BW-NEXT: movq %rdx, %rcx
-; AVX512BW-NEXT: movq %rax, %r14
+; AVX512BW-NEXT: movq %rbp, %r11
+; AVX512BW-NEXT: sarq $63, %r11
+; AVX512BW-NEXT: movq %r11, %r14
+; AVX512BW-NEXT: imulq %r9, %r14
; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %r9
-; AVX512BW-NEXT: movq %rdx, %rbp
+; AVX512BW-NEXT: mulq %rdi
; AVX512BW-NEXT: movq %rax, %rbx
-; AVX512BW-NEXT: addq %rcx, %rbx
-; AVX512BW-NEXT: adcq $0, %rbp
-; AVX512BW-NEXT: movq %r15, %rax
-; AVX512BW-NEXT: mulq %r10
-; AVX512BW-NEXT: movq %rdx, %rcx
-; AVX512BW-NEXT: movq %rax, %rdi
-; AVX512BW-NEXT: addq %rbx, %rdi
-; AVX512BW-NEXT: adcq %rbp, %rcx
+; AVX512BW-NEXT: addq %r14, %rdx
+; AVX512BW-NEXT: imulq %rdi, %r11
+; AVX512BW-NEXT: addq %rdx, %r11
+; AVX512BW-NEXT: addq %r10, %rbx
+; AVX512BW-NEXT: adcq %rsi, %r11
+; AVX512BW-NEXT: movq %rdi, %rax
+; AVX512BW-NEXT: mulq %r8
+; AVX512BW-NEXT: movq %rdx, %r10
+; AVX512BW-NEXT: movq %rax, %r14
+; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: mulq %r8
+; AVX512BW-NEXT: movq %rdx, %r8
+; AVX512BW-NEXT: movq %rax, %r15
+; AVX512BW-NEXT: addq %r10, %r15
+; AVX512BW-NEXT: adcq $0, %r8
+; AVX512BW-NEXT: movq %rdi, %rax
+; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: movq %rdx, %rdi
+; AVX512BW-NEXT: movq %rax, %r10
+; AVX512BW-NEXT: addq %r15, %r10
+; AVX512BW-NEXT: adcq %r8, %rdi
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %ebp
-; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %r10
-; AVX512BW-NEXT: addq %rcx, %rax
-; AVX512BW-NEXT: adcq %rbp, %rdx
-; AVX512BW-NEXT: addq %r12, %rax
+; AVX512BW-NEXT: movzbl %al, %esi
+; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: addq %rdi, %rax
; AVX512BW-NEXT: adcq %rsi, %rdx
-; AVX512BW-NEXT: movq %rdi, 8(%r8)
-; AVX512BW-NEXT: sarq $63, %rdi
-; AVX512BW-NEXT: xorq %rdi, %rdx
-; AVX512BW-NEXT: xorq %rax, %rdi
-; AVX512BW-NEXT: orq %rdx, %rdi
+; AVX512BW-NEXT: addq %rbx, %rax
+; AVX512BW-NEXT: adcq %r11, %rdx
+; AVX512BW-NEXT: movq %r10, 8(%r12)
+; AVX512BW-NEXT: sarq $63, %r10
+; AVX512BW-NEXT: xorq %r10, %rdx
+; AVX512BW-NEXT: xorq %rax, %r10
+; AVX512BW-NEXT: orq %rdx, %r10
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: andl $1, %eax
; AVX512BW-NEXT: kmovw %eax, %k1
@@ -4101,9 +4092,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: movq %rax, 16(%r8)
-; AVX512BW-NEXT: movq %r14, (%r8)
+; AVX512BW-NEXT: movq %rcx, 16(%r12)
+; AVX512BW-NEXT: movq %r14, (%r12)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
; AVX512BW-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 451f53f98697d..41b6ba51ec579 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -460,8 +460,8 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm8
-; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7
+; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
@@ -470,26 +470,26 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT: vpsubd %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8
+; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -1046,110 +1046,110 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: ssubo_v2i128:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: subq %r8, %rdi
; SSE2-NEXT: sbbq %r9, %rsi
; SSE2-NEXT: seto %r8b
; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: seto %al
-; SSE2-NEXT: movzbl %al, %eax
-; SSE2-NEXT: negl %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movzbl %r8b, %eax
-; SSE2-NEXT: negl %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: seto %r9b
+; SSE2-NEXT: movzbl %r9b, %r9d
+; SSE2-NEXT: negl %r9d
+; SSE2-NEXT: movd %r9d, %xmm1
+; SSE2-NEXT: movzbl %r8b, %r8d
+; SSE2-NEXT: negl %r8d
+; SSE2-NEXT: movd %r8d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rdx, 16(%r10)
-; SSE2-NEXT: movq %rdi, (%r10)
-; SSE2-NEXT: movq %rcx, 24(%r10)
-; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: movq %rdx, 16(%rax)
+; SSE2-NEXT: movq %rdi, (%rax)
+; SSE2-NEXT: movq %rcx, 24(%rax)
+; SSE2-NEXT: movq %rsi, 8(%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v2i128:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSSE3-NEXT: subq %r8, %rdi
; SSSE3-NEXT: sbbq %r9, %rsi
; SSSE3-NEXT: seto %r8b
; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: seto %al
-; SSSE3-NEXT: movzbl %al, %eax
-; SSSE3-NEXT: negl %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movzbl %r8b, %eax
-; SSSE3-NEXT: negl %eax
-; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: seto %r9b
+; SSSE3-NEXT: movzbl %r9b, %r9d
+; SSSE3-NEXT: negl %r9d
+; SSSE3-NEXT: movd %r9d, %xmm1
+; SSSE3-NEXT: movzbl %r8b, %r8d
+; SSSE3-NEXT: negl %r8d
+; SSSE3-NEXT: movd %r8d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rdx, 16(%r10)
-; SSSE3-NEXT: movq %rdi, (%r10)
-; SSSE3-NEXT: movq %rcx, 24(%r10)
-; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: movq %rdx, 16(%rax)
+; SSSE3-NEXT: movq %rdi, (%rax)
+; SSSE3-NEXT: movq %rcx, 24(%rax)
+; SSSE3-NEXT: movq %rsi, 8(%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v2i128:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE41-NEXT: subq %r8, %rdi
; SSE41-NEXT: sbbq %r9, %rsi
; SSE41-NEXT: seto %r8b
; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: seto %al
-; SSE41-NEXT: movzbl %al, %r9d
+; SSE41-NEXT: seto %r9b
+; SSE41-NEXT: movzbl %r9b, %r9d
; SSE41-NEXT: negl %r9d
-; SSE41-NEXT: movzbl %r8b, %eax
-; SSE41-NEXT: negl %eax
-; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movzbl %r8b, %r8d
+; SSE41-NEXT: negl %r8d
+; SSE41-NEXT: movd %r8d, %xmm0
; SSE41-NEXT: pinsrd $1, %r9d, %xmm0
-; SSE41-NEXT: movq %rdx, 16(%r10)
-; SSE41-NEXT: movq %rdi, (%r10)
-; SSE41-NEXT: movq %rcx, 24(%r10)
-; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: movq %rdx, 16(%rax)
+; SSE41-NEXT: movq %rdi, (%rax)
+; SSE41-NEXT: movq %rcx, 24(%rax)
+; SSE41-NEXT: movq %rsi, 8(%rax)
; SSE41-NEXT: retq
;
; AVX-LABEL: ssubo_v2i128:
; AVX: # %bb.0:
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: subq %r8, %rdi
; AVX-NEXT: sbbq %r9, %rsi
; AVX-NEXT: seto %r8b
; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: seto %al
-; AVX-NEXT: movzbl %al, %r9d
+; AVX-NEXT: seto %r9b
+; AVX-NEXT: movzbl %r9b, %r9d
; AVX-NEXT: negl %r9d
-; AVX-NEXT: movzbl %r8b, %eax
-; AVX-NEXT: negl %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: movzbl %r8b, %r8d
+; AVX-NEXT: negl %r8d
+; AVX-NEXT: vmovd %r8d, %xmm0
; AVX-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0
-; AVX-NEXT: movq %rdx, 16(%r10)
-; AVX-NEXT: movq %rdi, (%r10)
-; AVX-NEXT: movq %rcx, 24(%r10)
-; AVX-NEXT: movq %rsi, 8(%r10)
+; AVX-NEXT: movq %rdx, 16(%rax)
+; AVX-NEXT: movq %rdi, (%rax)
+; AVX-NEXT: movq %rcx, 24(%rax)
+; AVX-NEXT: movq %rsi, 8(%rax)
; AVX-NEXT: retq
;
; AVX512-LABEL: ssubo_v2i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: seto %al
-; AVX512-NEXT: kmovd %eax, %k0
+; AVX512-NEXT: seto %r10b
+; AVX512-NEXT: kmovd %r10d, %k0
; AVX512-NEXT: subq %r8, %rdi
; AVX512-NEXT: sbbq %r9, %rsi
-; AVX512-NEXT: seto %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: seto %r8b
+; AVX512-NEXT: andl $1, %r8d
+; AVX512-NEXT: kmovw %r8d, %k1
; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: korw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: movq %rdx, 16(%r10)
-; AVX512-NEXT: movq %rdi, (%r10)
-; AVX512-NEXT: movq %rcx, 24(%r10)
-; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: movq %rdx, 16(%rax)
+; AVX512-NEXT: movq %rdi, (%rax)
+; AVX512-NEXT: movq %rcx, 24(%rax)
+; AVX512-NEXT: movq %rsi, 8(%rax)
; AVX512-NEXT: retq
%t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 60b838e99409d..7c5ef84ecb19b 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1139,102 +1139,102 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: uaddo_v2i128:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: xorl %r11d, %r11d
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: xorl %r10d, %r10d
; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movl $0, %eax
-; SSE2-NEXT: sbbl %eax, %eax
+; SSE2-NEXT: movl $0, %r11d
+; SSE2-NEXT: sbbl %r11d, %r11d
; SSE2-NEXT: addq %r8, %rdi
; SSE2-NEXT: adcq %r9, %rsi
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: sbbl %r11d, %r11d
-; SSE2-NEXT: movd %r11d, %xmm0
+; SSE2-NEXT: movd %r11d, %xmm1
+; SSE2-NEXT: sbbl %r10d, %r10d
+; SSE2-NEXT: movd %r10d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rdx, 16(%r10)
-; SSE2-NEXT: movq %rdi, (%r10)
-; SSE2-NEXT: movq %rcx, 24(%r10)
-; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: movq %rdx, 16(%rax)
+; SSE2-NEXT: movq %rdi, (%rax)
+; SSE2-NEXT: movq %rcx, 24(%rax)
+; SSE2-NEXT: movq %rsi, 8(%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v2i128:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSSE3-NEXT: xorl %r11d, %r11d
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: xorl %r10d, %r10d
; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: movl $0, %eax
-; SSSE3-NEXT: sbbl %eax, %eax
+; SSSE3-NEXT: movl $0, %r11d
+; SSSE3-NEXT: sbbl %r11d, %r11d
; SSSE3-NEXT: addq %r8, %rdi
; SSSE3-NEXT: adcq %r9, %rsi
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: sbbl %r11d, %r11d
-; SSSE3-NEXT: movd %r11d, %xmm0
+; SSSE3-NEXT: movd %r11d, %xmm1
+; SSSE3-NEXT: sbbl %r10d, %r10d
+; SSSE3-NEXT: movd %r10d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rdx, 16(%r10)
-; SSSE3-NEXT: movq %rdi, (%r10)
-; SSSE3-NEXT: movq %rcx, 24(%r10)
-; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: movq %rdx, 16(%rax)
+; SSSE3-NEXT: movq %rdi, (%rax)
+; SSSE3-NEXT: movq %rcx, 24(%rax)
+; SSSE3-NEXT: movq %rsi, 8(%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v2i128:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE41-NEXT: xorl %r11d, %r11d
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE41-NEXT: xorl %r10d, %r10d
; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: movl $0, %eax
-; SSE41-NEXT: sbbl %eax, %eax
+; SSE41-NEXT: movl $0, %r11d
+; SSE41-NEXT: sbbl %r11d, %r11d
; SSE41-NEXT: addq %r8, %rdi
; SSE41-NEXT: adcq %r9, %rsi
-; SSE41-NEXT: sbbl %r11d, %r11d
-; SSE41-NEXT: movd %r11d, %xmm0
-; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: movq %rdx, 16(%r10)
-; SSE41-NEXT: movq %rdi, (%r10)
-; SSE41-NEXT: movq %rcx, 24(%r10)
-; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: sbbl %r10d, %r10d
+; SSE41-NEXT: movd %r10d, %xmm0
+; SSE41-NEXT: pinsrd $1, %r11d, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%rax)
+; SSE41-NEXT: movq %rdi, (%rax)
+; SSE41-NEXT: movq %rcx, 24(%rax)
+; SSE41-NEXT: movq %rsi, 8(%rax)
; SSE41-NEXT: retq
;
; AVX-LABEL: uaddo_v2i128:
; AVX: # %bb.0:
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX-NEXT: xorl %r11d, %r11d
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: xorl %r10d, %r10d
; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movl $0, %eax
-; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: movl $0, %r11d
+; AVX-NEXT: sbbl %r11d, %r11d
; AVX-NEXT: addq %r8, %rdi
; AVX-NEXT: adcq %r9, %rsi
-; AVX-NEXT: sbbl %r11d, %r11d
-; AVX-NEXT: vmovd %r11d, %xmm0
-; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %rdx, 16(%r10)
-; AVX-NEXT: movq %rdi, (%r10)
-; AVX-NEXT: movq %rcx, 24(%r10)
-; AVX-NEXT: movq %rsi, 8(%r10)
+; AVX-NEXT: sbbl %r10d, %r10d
+; AVX-NEXT: vmovd %r10d, %xmm0
+; AVX-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
+; AVX-NEXT: movq %rdx, 16(%rax)
+; AVX-NEXT: movq %rdi, (%rax)
+; AVX-NEXT: movq %rcx, 24(%rax)
+; AVX-NEXT: movq %rsi, 8(%rax)
; AVX-NEXT: retq
;
; AVX512-LABEL: uaddo_v2i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: setb %al
-; AVX512-NEXT: kmovd %eax, %k0
+; AVX512-NEXT: setb %r10b
+; AVX512-NEXT: kmovd %r10d, %k0
; AVX512-NEXT: addq %r8, %rdi
; AVX512-NEXT: adcq %r9, %rsi
-; AVX512-NEXT: setb %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: setb %r8b
+; AVX512-NEXT: andl $1, %r8d
+; AVX512-NEXT: kmovw %r8d, %k1
; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: korw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: movq %rdx, 16(%r10)
-; AVX512-NEXT: movq %rdi, (%r10)
-; AVX512-NEXT: movq %rcx, 24(%r10)
-; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: movq %rdx, 16(%rax)
+; AVX512-NEXT: movq %rdi, (%rax)
+; AVX512-NEXT: movq %rcx, 24(%rax)
+; AVX512-NEXT: movq %rsi, 8(%rax)
; AVX512-NEXT: retq
%t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index a1fa436507ce7..f7d4deeb85411 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -485,16 +485,16 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pmuludq %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
-; SSE41-NEXT: pxor %xmm8, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm1
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
; SSE41-NEXT: pxor %xmm6, %xmm1
; SSE41-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE41-NEXT: pmuludq %xmm7, %xmm3
+; SSE41-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE41-NEXT: pmuludq %xmm7, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5],xmm8[6,7]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm4
; SSE41-NEXT: pxor %xmm6, %xmm4
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: movq %xmm5, 16(%rcx)
@@ -513,17 +513,17 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -587,18 +587,18 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pmuludq %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
; SSE2-NEXT: pxor %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE2-NEXT: pmuludq %xmm8, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm7, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
@@ -621,18 +621,18 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pmuludq %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: pxor %xmm8, %xmm8
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7
; SSSE3-NEXT: pxor %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm3, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
+; SSSE3-NEXT: pmuludq %xmm8, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
; SSSE3-NEXT: pxor %xmm7, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
@@ -654,18 +654,18 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
-; SSE41-NEXT: pxor %xmm8, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
; SSE41-NEXT: pxor %xmm7, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm5, %xmm8
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmuludq %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5],xmm8[6,7]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
; SSE41-NEXT: pxor %xmm7, %xmm5
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm3, %xmm1
@@ -685,17 +685,17 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -747,125 +747,125 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v16i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm10, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm11
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm12, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm13, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm15
-; SSE2-NEXT: pxor %xmm11, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
+; SSE2-NEXT: pxor %xmm12, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm14, %xmm13
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE2-NEXT: pxor %xmm11, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSE2-NEXT: pcmpeqd %xmm11, %xmm8
+; SSE2-NEXT: pxor %xmm12, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm14, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm6
-; SSE2-NEXT: pxor %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
+; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
+; SSE2-NEXT: pxor %xmm12, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
-; SSE2-NEXT: movdqa %xmm8, (%rdi)
-; SSE2-NEXT: movdqa %xmm15, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm9, (%rdi)
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm2
; SSE2-NEXT: movdqa %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v16i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm4, %xmm8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm10, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm4, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm8, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: pxor %xmm10, %xmm10
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm11
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm11, %xmm11
+; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm12, %xmm12
+; SSSE3-NEXT: pxor %xmm12, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm13, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm15
-; SSSE3-NEXT: pxor %xmm11, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm8, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
+; SSSE3-NEXT: pxor %xmm12, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm14, %xmm13
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5
-; SSSE3-NEXT: pxor %xmm11, %xmm5
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSSE3-NEXT: pcmpeqd %xmm11, %xmm8
+; SSSE3-NEXT: pxor %xmm12, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm7, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm14, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6
-; SSSE3-NEXT: pxor %xmm11, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
+; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
+; SSSE3-NEXT: pxor %xmm12, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
-; SSSE3-NEXT: movdqa %xmm8, (%rdi)
-; SSSE3-NEXT: movdqa %xmm15, %xmm1
-; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm9, (%rdi)
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm6, %xmm3
; SSSE3-NEXT: retq
;
@@ -925,63 +925,63 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
;
; AVX1-LABEL: umulo_v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpmuludq %xmm10, %xmm12, %xmm7
+; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5],xmm4[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm8, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm13
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5
-; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm9, %xmm10, %xmm9
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5],xmm9[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm9, %xmm9
+; AVX1-NEXT: vpxor %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vpackssdw %xmm7, %xmm9, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm10, %xmm12, %xmm10
+; AVX1-NEXT: vpmuludq %xmm9, %xmm11, %xmm12
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3],xmm12[4,5],xmm10[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm10, %xmm10
+; AVX1-NEXT: vpxor %xmm8, %xmm10, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm12, %xmm13, %xmm12
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm13
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3],xmm13[4,5],xmm12[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm12, %xmm6
+; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpackssdw %xmm10, %xmm6, %xmm6
+; AVX1-NEXT: vpacksswb %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpmulld %xmm9, %xmm11, %xmm8
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6
-; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
+; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpacksswb %xmm11, %xmm11, %xmm1
+; AVX1-NEXT: vpacksswb %xmm7, %xmm7, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
-; AVX1-NEXT: vmovdqa %xmm6, 48(%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
-; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm8, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
@@ -1252,79 +1252,79 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; SSE2-NEXT: pmullw %xmm4, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm8
-; SSE2-NEXT: pand %xmm11, %xmm8
-; SSE2-NEXT: packuswb %xmm7, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: packuswb %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
-; SSE2-NEXT: pmullw %xmm7, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
+; SSE2-NEXT: pmullw %xmm7, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE2-NEXT: pmullw %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm11
-; SSE2-NEXT: packuswb %xmm7, %xmm11
-; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: packuswb %xmm7, %xmm4
+; SSE2-NEXT: psrlw $8, %xmm8
; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm8, %xmm1
; SSE2-NEXT: pcmpeqb %xmm5, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: psrlw $8, %xmm6
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm6, %xmm0
; SSE2-NEXT: pcmpeqb %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pslld $31, %xmm5
-; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $31, %xmm6
-; SSE2-NEXT: psrad $31, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pslld $31, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; SSE2-NEXT: psrad $24, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; SSE2-NEXT: psrad $24, %xmm4
-; SSE2-NEXT: movdqa %xmm11, 16(%rsi)
-; SSE2-NEXT: movdqa %xmm8, (%rsi)
-; SSE2-NEXT: movdqa %xmm4, 64(%rdi)
-; SSE2-NEXT: movdqa %xmm7, (%rdi)
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm8
+; SSE2-NEXT: movdqa %xmm4, 16(%rsi)
+; SSE2-NEXT: movdqa %xmm2, (%rsi)
+; SSE2-NEXT: movdqa %xmm8, 64(%rdi)
+; SSE2-NEXT: movdqa %xmm5, (%rdi)
; SSE2-NEXT: movdqa %xmm1, 112(%rdi)
-; SSE2-NEXT: movdqa %xmm2, 96(%rdi)
-; SSE2-NEXT: movdqa %xmm6, 80(%rdi)
+; SSE2-NEXT: movdqa %xmm9, 96(%rdi)
+; SSE2-NEXT: movdqa %xmm7, 80(%rdi)
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
-; SSE2-NEXT: movdqa %xmm5, 32(%rdi)
+; SSE2-NEXT: movdqa %xmm6, 32(%rdi)
; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
; SSE2-NEXT: retq
;
@@ -1337,118 +1337,118 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movdqa %xmm0, %xmm6
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; SSSE3-NEXT: pmullw %xmm4, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pand %xmm11, %xmm7
+; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSSE3-NEXT: pmullw %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm8
-; SSSE3-NEXT: pand %xmm11, %xmm8
-; SSSE3-NEXT: packuswb %xmm7, %xmm8
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: packuswb %xmm7, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm7
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
-; SSSE3-NEXT: pmullw %xmm7, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm7
-; SSSE3-NEXT: pand %xmm11, %xmm7
+; SSSE3-NEXT: movdqa %xmm1, %xmm8
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
+; SSSE3-NEXT: pmullw %xmm7, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm7
+; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSSE3-NEXT: pmullw %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm11
-; SSSE3-NEXT: packuswb %xmm7, %xmm11
-; SSSE3-NEXT: psrlw $8, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm4
+; SSSE3-NEXT: packuswb %xmm7, %xmm4
+; SSSE3-NEXT: psrlw $8, %xmm8
; SSSE3-NEXT: psrlw $8, %xmm1
-; SSSE3-NEXT: packuswb %xmm2, %xmm1
+; SSSE3-NEXT: packuswb %xmm8, %xmm1
; SSSE3-NEXT: pcmpeqb %xmm5, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm3, %xmm1
; SSSE3-NEXT: psrlw $8, %xmm6
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: packuswb %xmm6, %xmm0
; SSSE3-NEXT: pcmpeqb %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: pslld $31, %xmm5
-; SSSE3-NEXT: psrad $31, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pslld $31, %xmm6
+; SSSE3-NEXT: psrad $31, %xmm6
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm3
; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pslld $31, %xmm6
-; SSSE3-NEXT: psrad $31, %xmm6
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm7
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pslld $31, %xmm7
+; SSSE3-NEXT: psrad $31, %xmm7
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: pslld $31, %xmm2
-; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm9
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pslld $31, %xmm9
+; SSSE3-NEXT: psrad $31, %xmm9
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; SSSE3-NEXT: psrad $24, %xmm7
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; SSSE3-NEXT: psrad $24, %xmm4
-; SSSE3-NEXT: movdqa %xmm11, 16(%rsi)
-; SSSE3-NEXT: movdqa %xmm8, (%rsi)
-; SSSE3-NEXT: movdqa %xmm4, 64(%rdi)
-; SSSE3-NEXT: movdqa %xmm7, (%rdi)
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm5
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm8
+; SSSE3-NEXT: movdqa %xmm4, 16(%rsi)
+; SSSE3-NEXT: movdqa %xmm2, (%rsi)
+; SSSE3-NEXT: movdqa %xmm8, 64(%rdi)
+; SSSE3-NEXT: movdqa %xmm5, (%rdi)
; SSSE3-NEXT: movdqa %xmm1, 112(%rdi)
-; SSSE3-NEXT: movdqa %xmm2, 96(%rdi)
-; SSSE3-NEXT: movdqa %xmm6, 80(%rdi)
+; SSSE3-NEXT: movdqa %xmm9, 96(%rdi)
+; SSSE3-NEXT: movdqa %xmm7, 80(%rdi)
; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
-; SSSE3-NEXT: movdqa %xmm5, 32(%rdi)
+; SSSE3-NEXT: movdqa %xmm6, 32(%rdi)
; SSSE3-NEXT: movdqa %xmm0, 16(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
-; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: pxor %xmm7, %xmm7
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
; SSE41-NEXT: pmullw %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pand %xmm10, %xmm6
+; SSE41-NEXT: pand %xmm2, %xmm6
; SSE41-NEXT: pmullw %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm9
-; SSE41-NEXT: pand %xmm10, %xmm9
-; SSE41-NEXT: packuswb %xmm6, %xmm9
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pand %xmm2, %xmm5
+; SSE41-NEXT: packuswb %xmm6, %xmm5
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15]
; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pand %xmm10, %xmm3
-; SSE41-NEXT: pmullw %xmm7, %xmm6
-; SSE41-NEXT: pand %xmm6, %xmm10
-; SSE41-NEXT: packuswb %xmm3, %xmm10
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: pmullw %xmm8, %xmm6
+; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: psrlw $8, %xmm6
; SSE41-NEXT: packuswb %xmm1, %xmm6
-; SSE41-NEXT: pcmpeqb %xmm8, %xmm6
+; SSE41-NEXT: pcmpeqb %xmm7, %xmm6
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm6
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: packuswb %xmm0, %xmm4
-; SSE41-NEXT: pcmpeqb %xmm8, %xmm4
+; SSE41-NEXT: pcmpeqb %xmm7, %xmm4
; SSE41-NEXT: pxor %xmm1, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -1466,22 +1466,22 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm7
; SSE41-NEXT: psrad $31, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; SSE41-NEXT: pslld $31, %xmm5
-; SSE41-NEXT: psrad $31, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; SSE41-NEXT: pslld $31, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm8
+; SSE41-NEXT: psrad $31, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm9
+; SSE41-NEXT: psrad $31, %xmm9
; SSE41-NEXT: pmovsxbd %xmm4, %xmm4
; SSE41-NEXT: pmovsxbd %xmm6, %xmm6
-; SSE41-NEXT: movdqa %xmm10, 16(%rsi)
-; SSE41-NEXT: movdqa %xmm9, (%rsi)
+; SSE41-NEXT: movdqa %xmm2, 16(%rsi)
+; SSE41-NEXT: movdqa %xmm5, (%rsi)
; SSE41-NEXT: movdqa %xmm6, 64(%rdi)
; SSE41-NEXT: movdqa %xmm4, (%rdi)
-; SSE41-NEXT: movdqa %xmm2, 112(%rdi)
-; SSE41-NEXT: movdqa %xmm5, 96(%rdi)
+; SSE41-NEXT: movdqa %xmm9, 112(%rdi)
+; SSE41-NEXT: movdqa %xmm8, 96(%rdi)
; SSE41-NEXT: movdqa %xmm7, 80(%rdi)
; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
; SSE41-NEXT: movdqa %xmm1, 32(%rdi)
@@ -1500,24 +1500,24 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm7
-; AVX1-NEXT: vpackuswb %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vpackuswb %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm7
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm5
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1
+; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm3
; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
@@ -1527,8 +1527,8 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
+; AVX1-NEXT: vpmovsxbd %xmm7, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
@@ -1536,13 +1536,13 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
+; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm8, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v32i8:
@@ -1629,61 +1629,61 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
; SSE2-NEXT: pmullw %xmm8, %xmm10
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: pand %xmm8, %xmm12
+; SSE2-NEXT: movdqa %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm8, %xmm11
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE2-NEXT: pmullw %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm11
-; SSE2-NEXT: pand %xmm8, %xmm11
-; SSE2-NEXT: packuswb %xmm12, %xmm11
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
-; SSE2-NEXT: pmullw %xmm4, %xmm13
-; SSE2-NEXT: movdqa %xmm13, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: packuswb %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm12
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; SSE2-NEXT: pmullw %xmm12, %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pand %xmm8, %xmm12
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
; SSE2-NEXT: pmullw %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm12
-; SSE2-NEXT: pand %xmm8, %xmm12
-; SSE2-NEXT: packuswb %xmm4, %xmm12
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
-; SSE2-NEXT: pmullw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: packuswb %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm6, %xmm13
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm12
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; SSE2-NEXT: pmullw %xmm13, %xmm12
+; SSE2-NEXT: movdqa %xmm12, %xmm13
+; SSE2-NEXT: pand %xmm8, %xmm13
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
; SSE2-NEXT: pmullw %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm14
-; SSE2-NEXT: pand %xmm8, %xmm14
-; SSE2-NEXT: packuswb %xmm4, %xmm14
-; SSE2-NEXT: movdqa %xmm7, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
-; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: packuswb %xmm13, %xmm6
+; SSE2-NEXT: movdqa %xmm7, %xmm13
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm14
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
+; SSE2-NEXT: pmullw %xmm13, %xmm14
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE2-NEXT: pmullw %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm3, %xmm8
-; SSE2-NEXT: packuswb %xmm4, %xmm8
-; SSE2-NEXT: psrlw $8, %xmm6
+; SSE2-NEXT: packuswb %xmm7, %xmm8
+; SSE2-NEXT: psrlw $8, %xmm14
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm5
+; SSE2-NEXT: packuswb %xmm14, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm12
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm13
+; SSE2-NEXT: packuswb %xmm12, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm11
; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: packuswb %xmm13, %xmm1
+; SSE2-NEXT: packuswb %xmm11, %xmm1
; SSE2-NEXT: psrlw $8, %xmm10
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm10, %xmm0
@@ -1691,42 +1691,42 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pcmpeqb %xmm9, %xmm2
; SSE2-NEXT: pcmpeqb %xmm9, %xmm1
; SSE2-NEXT: pcmpeqb %xmm9, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm7, %xmm2
+; SSE2-NEXT: pxor %xmm7, %xmm1
+; SSE2-NEXT: pxor %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm8, 48(%rsi)
-; SSE2-NEXT: movdqa %xmm14, 32(%rsi)
-; SSE2-NEXT: movdqa %xmm12, 16(%rsi)
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa %xmm11, (%rsi)
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm5
-; SSE2-NEXT: movdqa %xmm5, 192(%rdi)
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm5
-; SSE2-NEXT: movdqa %xmm5, 128(%rdi)
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm5
-; SSE2-NEXT: movdqa %xmm5, 64(%rdi)
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm5
-; SSE2-NEXT: movdqa %xmm5, (%rdi)
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm6, 32(%rsi)
+; SSE2-NEXT: movdqa %xmm5, 16(%rsi)
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm4, (%rsi)
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pslld $31, %xmm4
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: movdqa %xmm4, 224(%rdi)
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $24, %xmm4
+; SSE2-NEXT: movdqa %xmm4, 192(%rdi)
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm4
+; SSE2-NEXT: movdqa %xmm4, 128(%rdi)
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm4
+; SSE2-NEXT: movdqa %xmm4, 64(%rdi)
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm4
+; SSE2-NEXT: movdqa %xmm4, (%rdi)
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: movdqa %xmm5, 240(%rdi)
+; SSE2-NEXT: movdqa %xmm5, 224(%rdi)
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: movdqa %xmm4, 240(%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1792,61 +1792,61 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
; SSSE3-NEXT: pmullw %xmm8, %xmm10
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
-; SSSE3-NEXT: movdqa %xmm10, %xmm12
-; SSSE3-NEXT: pand %xmm8, %xmm12
+; SSSE3-NEXT: movdqa %xmm10, %xmm11
+; SSSE3-NEXT: pand %xmm8, %xmm11
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSSE3-NEXT: pmullw %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm11
-; SSSE3-NEXT: pand %xmm8, %xmm11
-; SSSE3-NEXT: packuswb %xmm12, %xmm11
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSSE3-NEXT: movdqa %xmm1, %xmm13
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
-; SSSE3-NEXT: pmullw %xmm4, %xmm13
-; SSSE3-NEXT: movdqa %xmm13, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: packuswb %xmm11, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm12
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm11
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; SSSE3-NEXT: pmullw %xmm12, %xmm11
+; SSSE3-NEXT: movdqa %xmm11, %xmm12
+; SSSE3-NEXT: pand %xmm8, %xmm12
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
; SSSE3-NEXT: pmullw %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm12
-; SSSE3-NEXT: pand %xmm8, %xmm12
-; SSSE3-NEXT: packuswb %xmm4, %xmm12
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
-; SSSE3-NEXT: pmullw %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: packuswb %xmm12, %xmm5
+; SSSE3-NEXT: movdqa %xmm6, %xmm13
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm12
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; SSSE3-NEXT: pmullw %xmm13, %xmm12
+; SSSE3-NEXT: movdqa %xmm12, %xmm13
+; SSSE3-NEXT: pand %xmm8, %xmm13
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
; SSSE3-NEXT: pmullw %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm14
-; SSSE3-NEXT: pand %xmm8, %xmm14
-; SSSE3-NEXT: packuswb %xmm4, %xmm14
-; SSSE3-NEXT: movdqa %xmm7, %xmm4
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
-; SSSE3-NEXT: pmullw %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pand %xmm8, %xmm6
+; SSSE3-NEXT: packuswb %xmm13, %xmm6
+; SSSE3-NEXT: movdqa %xmm7, %xmm13
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
+; SSSE3-NEXT: movdqa %xmm3, %xmm14
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
+; SSSE3-NEXT: pmullw %xmm13, %xmm14
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSSE3-NEXT: pmullw %xmm7, %xmm3
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm14, %xmm7
+; SSSE3-NEXT: pand %xmm8, %xmm7
; SSSE3-NEXT: pand %xmm3, %xmm8
-; SSSE3-NEXT: packuswb %xmm4, %xmm8
-; SSSE3-NEXT: psrlw $8, %xmm6
+; SSSE3-NEXT: packuswb %xmm7, %xmm8
+; SSSE3-NEXT: psrlw $8, %xmm14
; SSSE3-NEXT: psrlw $8, %xmm3
-; SSSE3-NEXT: packuswb %xmm6, %xmm3
-; SSSE3-NEXT: psrlw $8, %xmm5
+; SSSE3-NEXT: packuswb %xmm14, %xmm3
+; SSSE3-NEXT: psrlw $8, %xmm12
; SSSE3-NEXT: psrlw $8, %xmm2
-; SSSE3-NEXT: packuswb %xmm5, %xmm2
-; SSSE3-NEXT: psrlw $8, %xmm13
+; SSSE3-NEXT: packuswb %xmm12, %xmm2
+; SSSE3-NEXT: psrlw $8, %xmm11
; SSSE3-NEXT: psrlw $8, %xmm1
-; SSSE3-NEXT: packuswb %xmm13, %xmm1
+; SSSE3-NEXT: packuswb %xmm11, %xmm1
; SSSE3-NEXT: psrlw $8, %xmm10
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: packuswb %xmm10, %xmm0
@@ -1854,42 +1854,42 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2
; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1
; SSSE3-NEXT: pcmpeqb %xmm9, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7
+; SSSE3-NEXT: pxor %xmm7, %xmm3
+; SSSE3-NEXT: pxor %xmm7, %xmm2
+; SSSE3-NEXT: pxor %xmm7, %xmm1
+; SSSE3-NEXT: pxor %xmm7, %xmm0
; SSSE3-NEXT: movdqa %xmm8, 48(%rsi)
-; SSSE3-NEXT: movdqa %xmm14, 32(%rsi)
-; SSSE3-NEXT: movdqa %xmm12, 16(%rsi)
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: movdqa %xmm11, (%rsi)
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $24, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, 192(%rdi)
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $24, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, 128(%rdi)
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $24, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, 64(%rdi)
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $24, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, (%rdi)
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm6, 32(%rsi)
+; SSSE3-NEXT: movdqa %xmm5, 16(%rsi)
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: movdqa %xmm4, (%rsi)
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: pslld $31, %xmm4
-; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, 224(%rdi)
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: psrad $24, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, 192(%rdi)
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, 128(%rdi)
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, 64(%rdi)
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm5
; SSSE3-NEXT: psrad $31, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, 240(%rdi)
+; SSSE3-NEXT: movdqa %xmm5, 224(%rdi)
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pslld $31, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, 240(%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -2094,94 +2094,94 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm8
+; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm11
-; AVX1-NEXT: vpand %xmm6, %xmm11, %xmm4
-; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm8
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm8
+; AVX1-NEXT: vpackuswb %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm12
-; AVX1-NEXT: vpand %xmm6, %xmm12, %xmm7
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
+; AVX1-NEXT: vpmullw %xmm8, %xmm10, %xmm8
+; AVX1-NEXT: vpand %xmm9, %xmm8, %xmm11
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm13
-; AVX1-NEXT: vpand %xmm6, %xmm13, %xmm2
-; AVX1-NEXT: vpackuswb %xmm7, %xmm2, %xmm10
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm10
+; AVX1-NEXT: vpand %xmm9, %xmm10, %xmm0
+; AVX1-NEXT: vpackuswb %xmm11, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
-; AVX1-NEXT: vpmullw %xmm2, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm4
-; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm14
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; AVX1-NEXT: vpmullw %xmm2, %xmm11, %xmm11
+; AVX1-NEXT: vpand %xmm9, %xmm11, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmullw %xmm12, %xmm13, %xmm12
+; AVX1-NEXT: vpand %xmm9, %xmm12, %xmm13
+; AVX1-NEXT: vpackuswb %xmm2, %xmm13, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; AVX1-NEXT: vpmullw %xmm13, %xmm14, %xmm13
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm4
-; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm15
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm9, %xmm13, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm9
+; AVX1-NEXT: vpackuswb %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm9
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm4
-; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm6
-; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm9
+; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm11
+; AVX1-NEXT: vpackuswb %xmm9, %xmm11, %xmm9
+; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8
+; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm10
+; AVX1-NEXT: vpackuswb %xmm8, %xmm10, %xmm8
+; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
+; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm2, %xmm6
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm3
-; AVX1-NEXT: vmovdqa %xmm15, 48(%rsi)
-; AVX1-NEXT: vmovdqa %xmm14, 32(%rsi)
-; AVX1-NEXT: vmovdqa %xmm10, 16(%rsi)
-; AVX1-NEXT: vmovdqa %xmm8, (%rsi)
-; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm9
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm8
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm10
+; AVX1-NEXT: vpcmpeqd %xmm11, %xmm11, %xmm11
+; AVX1-NEXT: vpxor %xmm3, %xmm11, %xmm7
+; AVX1-NEXT: vpxor %xmm11, %xmm9, %xmm6
+; AVX1-NEXT: vpxor %xmm11, %xmm8, %xmm5
+; AVX1-NEXT: vpxor %xmm11, %xmm10, %xmm3
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rsi)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rsi)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX1-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi)
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
@@ -2218,7 +2218,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm0
-; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm9
+; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
; AVX2-NEXT: vpmullw %ymm7, %ymm8, %ymm7
@@ -2227,7 +2227,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm3
-; AVX2-NEXT: vpackuswb %ymm8, %ymm3, %ymm8
+; AVX2-NEXT: vpackuswb %ymm8, %ymm3, %ymm3
; AVX2-NEXT: vpsrlw $8, %ymm7, %ymm6
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm6, %ymm1, %ymm1
@@ -2246,20 +2246,20 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,3,2,3]
+; AVX2-NEXT: vpmovsxbd %xmm9, %ymm9
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm8, 32(%rsi)
-; AVX2-NEXT: vmovdqa %ymm9, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm0, 192(%rdi)
+; AVX2-NEXT: vpmovsxbd %xmm8, %ymm8
+; AVX2-NEXT: vmovdqa %ymm3, 32(%rsi)
+; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX2-NEXT: vmovdqa %ymm8, 192(%rdi)
; AVX2-NEXT: vmovdqa %ymm1, 128(%rdi)
; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
-; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi)
+; AVX2-NEXT: vmovdqa %ymm9, 224(%rdi)
; AVX2-NEXT: vmovdqa %ymm7, 160(%rdi)
; AVX2-NEXT: vmovdqa %ymm6, 96(%rdi)
; AVX2-NEXT: vmovdqa %ymm4, 32(%rdi)
@@ -2459,24 +2459,24 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE2-LABEL: umulo_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %r8
+; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %r10
+; SSE2-NEXT: movq %xmm2, %rsi
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movq %xmm1, %rdx
-; SSE2-NEXT: xorl %ecx, %ecx
+; SSE2-NEXT: xorl %r8d, %r8d
; SSE2-NEXT: mulq %rdx
; SSE2-NEXT: movq $-1, %r9
-; SSE2-NEXT: movl $0, %esi
-; SSE2-NEXT: cmovoq %r9, %rsi
+; SSE2-NEXT: movl $0, %r10d
+; SSE2-NEXT: cmovoq %r9, %r10
; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: mulq %r10
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movq %rsi, %xmm0
-; SSE2-NEXT: cmovoq %r9, %rcx
-; SSE2-NEXT: movq %rcx, %xmm2
+; SSE2-NEXT: movq %r10, %xmm0
+; SSE2-NEXT: cmovoq %r9, %r8
+; SSE2-NEXT: movq %r8, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa %xmm1, (%rdi)
@@ -2485,24 +2485,24 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSSE3-LABEL: umulo_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSSE3-NEXT: movq %xmm2, %r8
+; SSSE3-NEXT: movq %xmm2, %rcx
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSSE3-NEXT: movq %xmm2, %r10
+; SSSE3-NEXT: movq %xmm2, %rsi
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: movq %xmm1, %rdx
-; SSSE3-NEXT: xorl %ecx, %ecx
+; SSSE3-NEXT: xorl %r8d, %r8d
; SSSE3-NEXT: mulq %rdx
; SSSE3-NEXT: movq $-1, %r9
-; SSSE3-NEXT: movl $0, %esi
-; SSSE3-NEXT: cmovoq %r9, %rsi
+; SSSE3-NEXT: movl $0, %r10d
+; SSSE3-NEXT: cmovoq %r9, %r10
; SSSE3-NEXT: movq %rax, %xmm1
-; SSSE3-NEXT: movq %r8, %rax
-; SSSE3-NEXT: mulq %r10
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rax, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: movq %rsi, %xmm0
-; SSSE3-NEXT: cmovoq %r9, %rcx
-; SSSE3-NEXT: movq %rcx, %xmm2
+; SSSE3-NEXT: movq %r10, %xmm0
+; SSSE3-NEXT: cmovoq %r9, %r8
+; SSSE3-NEXT: movq %r8, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: movdqa %xmm1, (%rdi)
@@ -2510,23 +2510,23 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
;
; SSE41-LABEL: umulo_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %r10
-; SSE41-NEXT: movq %xmm1, %r8
+; SSE41-NEXT: movq %xmm0, %rcx
+; SSE41-NEXT: movq %xmm1, %rsi
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: pextrq $1, %xmm1, %rdx
-; SSE41-NEXT: xorl %esi, %esi
+; SSE41-NEXT: xorl %r8d, %r8d
; SSE41-NEXT: mulq %rdx
; SSE41-NEXT: movq $-1, %r9
-; SSE41-NEXT: movl $0, %ecx
-; SSE41-NEXT: cmovoq %r9, %rcx
+; SSE41-NEXT: movl $0, %r10d
+; SSE41-NEXT: cmovoq %r9, %r10
; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: movq %r10, %rax
-; SSE41-NEXT: mulq %r8
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rax, %xmm1
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: movq %rcx, %xmm0
-; SSE41-NEXT: cmovoq %r9, %rsi
-; SSE41-NEXT: movq %rsi, %xmm2
+; SSE41-NEXT: movq %r10, %xmm0
+; SSE41-NEXT: cmovoq %r9, %r8
+; SSE41-NEXT: movq %r8, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: movdqa %xmm1, (%rdi)
@@ -2534,23 +2534,23 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
;
; AVX-LABEL: umulo_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovq %xmm0, %r10
-; AVX-NEXT: vmovq %xmm1, %r8
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: vmovq %xmm1, %rsi
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: xorl %r8d, %r8d
; AVX-NEXT: mulq %rdx
; AVX-NEXT: movq $-1, %r9
-; AVX-NEXT: movl $0, %ecx
-; AVX-NEXT: cmovoq %r9, %rcx
+; AVX-NEXT: movl $0, %r10d
+; AVX-NEXT: cmovoq %r9, %r10
; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r8
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX-NEXT: vmovq %rcx, %xmm0
-; AVX-NEXT: cmovoq %r9, %rsi
-; AVX-NEXT: vmovq %rsi, %xmm2
+; AVX-NEXT: vmovq %r10, %xmm0
+; AVX-NEXT: cmovoq %r9, %r8
+; AVX-NEXT: vmovq %r8, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovdqa %xmm1, (%rdi)
@@ -2907,14 +2907,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %r9, %r10
-; SSE2-NEXT: movq %rcx, %r12
-; SSE2-NEXT: movq %rdx, %r11
+; SSE2-NEXT: movq %rcx, %r11
+; SSE2-NEXT: movq %rdx, %rcx
; SSE2-NEXT: movq %rsi, %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSE2-NEXT: testq %r10, %r10
@@ -2924,54 +2923,53 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: andb %dl, %bpl
; SSE2-NEXT: mulq %r8
; SSE2-NEXT: movq %rax, %rsi
-; SSE2-NEXT: seto %bl
+; SSE2-NEXT: seto %r15b
; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %rdi
-; SSE2-NEXT: seto %cl
-; SSE2-NEXT: orb %bl, %cl
-; SSE2-NEXT: leaq (%rsi,%rax), %rbx
+; SSE2-NEXT: seto %r12b
+; SSE2-NEXT: orb %r15b, %r12b
+; SSE2-NEXT: leaq (%rsi,%rax), %r10
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: mulq %r8
; SSE2-NEXT: movq %rax, %rdi
; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: addq %rbx, %rsi
-; SSE2-NEXT: setb %r13b
-; SSE2-NEXT: orb %cl, %r13b
-; SSE2-NEXT: orb %bpl, %r13b
+; SSE2-NEXT: addq %r10, %rsi
+; SSE2-NEXT: setb %r10b
+; SSE2-NEXT: orb %r12b, %r10b
+; SSE2-NEXT: orb %bpl, %r10b
; SSE2-NEXT: testq %r9, %r9
; SSE2-NEXT: setne %al
-; SSE2-NEXT: testq %r12, %r12
-; SSE2-NEXT: setne %r10b
-; SSE2-NEXT: andb %al, %r10b
-; SSE2-NEXT: movq %r12, %rax
+; SSE2-NEXT: testq %r11, %r11
+; SSE2-NEXT: setne %bpl
+; SSE2-NEXT: andb %al, %bpl
+; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r14
-; SSE2-NEXT: movq %rax, %rbp
-; SSE2-NEXT: seto %r8b
+; SSE2-NEXT: movq %rax, %r8
+; SSE2-NEXT: seto %r11b
; SSE2-NEXT: movq %r9, %rax
-; SSE2-NEXT: mulq %r11
-; SSE2-NEXT: seto %cl
-; SSE2-NEXT: orb %r8b, %cl
-; SSE2-NEXT: addq %rax, %rbp
-; SSE2-NEXT: movq %r11, %rax
+; SSE2-NEXT: mulq %rcx
+; SSE2-NEXT: seto %r9b
+; SSE2-NEXT: orb %r11b, %r9b
+; SSE2-NEXT: addq %rax, %r8
+; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %r14
-; SSE2-NEXT: addq %rbp, %rdx
-; SSE2-NEXT: setb %bl
-; SSE2-NEXT: orb %cl, %bl
-; SSE2-NEXT: orb %r10b, %bl
-; SSE2-NEXT: movzbl %bl, %ecx
+; SSE2-NEXT: addq %r8, %rdx
+; SSE2-NEXT: setb %cl
+; SSE2-NEXT: orb %r9b, %cl
+; SSE2-NEXT: orb %bpl, %cl
+; SSE2-NEXT: movzbl %cl, %ecx
; SSE2-NEXT: negl %ecx
; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: movzbl %r13b, %ecx
+; SSE2-NEXT: movzbl %r10b, %ecx
; SSE2-NEXT: negl %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rax, 16(%r15)
-; SSE2-NEXT: movq %rdi, (%r15)
-; SSE2-NEXT: movq %rdx, 24(%r15)
-; SSE2-NEXT: movq %rsi, 8(%r15)
+; SSE2-NEXT: movq %rax, 16(%rbx)
+; SSE2-NEXT: movq %rdi, (%rbx)
+; SSE2-NEXT: movq %rdx, 24(%rbx)
+; SSE2-NEXT: movq %rsi, 8(%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: popq %r13
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %r15
; SSE2-NEXT: popq %rbp
@@ -2982,14 +2980,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %r15
; SSSE3-NEXT: pushq %r14
-; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: movq %r9, %r10
-; SSSE3-NEXT: movq %rcx, %r12
-; SSSE3-NEXT: movq %rdx, %r11
+; SSSE3-NEXT: movq %rcx, %r11
+; SSSE3-NEXT: movq %rdx, %rcx
; SSSE3-NEXT: movq %rsi, %rax
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSSE3-NEXT: testq %r10, %r10
@@ -2999,54 +2996,53 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: andb %dl, %bpl
; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: movq %rax, %rsi
-; SSSE3-NEXT: seto %bl
+; SSSE3-NEXT: seto %r15b
; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %rdi
-; SSSE3-NEXT: seto %cl
-; SSSE3-NEXT: orb %bl, %cl
-; SSSE3-NEXT: leaq (%rsi,%rax), %rbx
+; SSSE3-NEXT: seto %r12b
+; SSSE3-NEXT: orb %r15b, %r12b
+; SSSE3-NEXT: leaq (%rsi,%rax), %r10
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: movq %rax, %rdi
; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: addq %rbx, %rsi
-; SSSE3-NEXT: setb %r13b
-; SSSE3-NEXT: orb %cl, %r13b
-; SSSE3-NEXT: orb %bpl, %r13b
+; SSSE3-NEXT: addq %r10, %rsi
+; SSSE3-NEXT: setb %r10b
+; SSSE3-NEXT: orb %r12b, %r10b
+; SSSE3-NEXT: orb %bpl, %r10b
; SSSE3-NEXT: testq %r9, %r9
; SSSE3-NEXT: setne %al
-; SSSE3-NEXT: testq %r12, %r12
-; SSSE3-NEXT: setne %r10b
-; SSSE3-NEXT: andb %al, %r10b
-; SSSE3-NEXT: movq %r12, %rax
+; SSSE3-NEXT: testq %r11, %r11
+; SSSE3-NEXT: setne %bpl
+; SSSE3-NEXT: andb %al, %bpl
+; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r14
-; SSSE3-NEXT: movq %rax, %rbp
-; SSSE3-NEXT: seto %r8b
+; SSSE3-NEXT: movq %rax, %r8
+; SSSE3-NEXT: seto %r11b
; SSSE3-NEXT: movq %r9, %rax
-; SSSE3-NEXT: mulq %r11
-; SSSE3-NEXT: seto %cl
-; SSSE3-NEXT: orb %r8b, %cl
-; SSSE3-NEXT: addq %rax, %rbp
-; SSSE3-NEXT: movq %r11, %rax
+; SSSE3-NEXT: mulq %rcx
+; SSSE3-NEXT: seto %r9b
+; SSSE3-NEXT: orb %r11b, %r9b
+; SSSE3-NEXT: addq %rax, %r8
+; SSSE3-NEXT: movq %rcx, %rax
; SSSE3-NEXT: mulq %r14
-; SSSE3-NEXT: addq %rbp, %rdx
-; SSSE3-NEXT: setb %bl
-; SSSE3-NEXT: orb %cl, %bl
-; SSSE3-NEXT: orb %r10b, %bl
-; SSSE3-NEXT: movzbl %bl, %ecx
+; SSSE3-NEXT: addq %r8, %rdx
+; SSSE3-NEXT: setb %cl
+; SSSE3-NEXT: orb %r9b, %cl
+; SSSE3-NEXT: orb %bpl, %cl
+; SSSE3-NEXT: movzbl %cl, %ecx
; SSSE3-NEXT: negl %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movzbl %r13b, %ecx
+; SSSE3-NEXT: movzbl %r10b, %ecx
; SSSE3-NEXT: negl %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rax, 16(%r15)
-; SSSE3-NEXT: movq %rdi, (%r15)
-; SSSE3-NEXT: movq %rdx, 24(%r15)
-; SSSE3-NEXT: movq %rsi, 8(%r15)
+; SSSE3-NEXT: movq %rax, 16(%rbx)
+; SSSE3-NEXT: movq %rdi, (%rbx)
+; SSSE3-NEXT: movq %rdx, 24(%rbx)
+; SSSE3-NEXT: movq %rsi, 8(%rbx)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
-; SSSE3-NEXT: popq %r13
; SSSE3-NEXT: popq %r14
; SSSE3-NEXT: popq %r15
; SSSE3-NEXT: popq %rbp
@@ -3057,14 +3053,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: pushq %rbp
; SSE41-NEXT: pushq %r15
; SSE41-NEXT: pushq %r14
-; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: movq %r9, %r10
-; SSE41-NEXT: movq %rcx, %r12
-; SSE41-NEXT: movq %rdx, %r11
+; SSE41-NEXT: movq %rcx, %r11
+; SSE41-NEXT: movq %rdx, %rcx
; SSE41-NEXT: movq %rsi, %rax
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSE41-NEXT: testq %r10, %r10
@@ -3074,53 +3069,52 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: andb %dl, %bpl
; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rax, %rsi
-; SSE41-NEXT: seto %bl
+; SSE41-NEXT: seto %r15b
; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %rdi
-; SSE41-NEXT: seto %cl
-; SSE41-NEXT: orb %bl, %cl
-; SSE41-NEXT: leaq (%rsi,%rax), %rbx
+; SSE41-NEXT: seto %r12b
+; SSE41-NEXT: orb %r15b, %r12b
+; SSE41-NEXT: leaq (%rsi,%rax), %r10
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rax, %rdi
; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: addq %rbx, %rsi
-; SSE41-NEXT: setb %r13b
-; SSE41-NEXT: orb %cl, %r13b
-; SSE41-NEXT: orb %bpl, %r13b
+; SSE41-NEXT: addq %r10, %rsi
+; SSE41-NEXT: setb %r10b
+; SSE41-NEXT: orb %r12b, %r10b
+; SSE41-NEXT: orb %bpl, %r10b
; SSE41-NEXT: testq %r9, %r9
; SSE41-NEXT: setne %al
-; SSE41-NEXT: testq %r12, %r12
-; SSE41-NEXT: setne %r10b
-; SSE41-NEXT: andb %al, %r10b
-; SSE41-NEXT: movq %r12, %rax
+; SSE41-NEXT: testq %r11, %r11
+; SSE41-NEXT: setne %bpl
+; SSE41-NEXT: andb %al, %bpl
+; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r14
-; SSE41-NEXT: movq %rax, %rbp
-; SSE41-NEXT: seto %r8b
+; SSE41-NEXT: movq %rax, %r8
+; SSE41-NEXT: seto %r11b
; SSE41-NEXT: movq %r9, %rax
-; SSE41-NEXT: mulq %r11
-; SSE41-NEXT: seto %cl
-; SSE41-NEXT: orb %r8b, %cl
-; SSE41-NEXT: addq %rax, %rbp
-; SSE41-NEXT: movq %r11, %rax
+; SSE41-NEXT: mulq %rcx
+; SSE41-NEXT: seto %r9b
+; SSE41-NEXT: orb %r11b, %r9b
+; SSE41-NEXT: addq %rax, %r8
+; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %r14
-; SSE41-NEXT: addq %rbp, %rdx
-; SSE41-NEXT: setb %bl
-; SSE41-NEXT: orb %cl, %bl
-; SSE41-NEXT: orb %r10b, %bl
-; SSE41-NEXT: movzbl %bl, %ecx
+; SSE41-NEXT: addq %r8, %rdx
+; SSE41-NEXT: setb %cl
+; SSE41-NEXT: orb %r9b, %cl
+; SSE41-NEXT: orb %bpl, %cl
+; SSE41-NEXT: movzbl %cl, %ecx
; SSE41-NEXT: negl %ecx
-; SSE41-NEXT: movzbl %r13b, %ebp
-; SSE41-NEXT: negl %ebp
-; SSE41-NEXT: movd %ebp, %xmm0
+; SSE41-NEXT: movzbl %r10b, %r8d
+; SSE41-NEXT: negl %r8d
+; SSE41-NEXT: movd %r8d, %xmm0
; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; SSE41-NEXT: movq %rax, 16(%r15)
-; SSE41-NEXT: movq %rdi, (%r15)
-; SSE41-NEXT: movq %rdx, 24(%r15)
-; SSE41-NEXT: movq %rsi, 8(%r15)
+; SSE41-NEXT: movq %rax, 16(%rbx)
+; SSE41-NEXT: movq %rdi, (%rbx)
+; SSE41-NEXT: movq %rdx, 24(%rbx)
+; SSE41-NEXT: movq %rsi, 8(%rbx)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
; SSE41-NEXT: popq %r14
; SSE41-NEXT: popq %r15
; SSE41-NEXT: popq %rbp
@@ -3131,14 +3125,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r15
; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %r13
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %r9, %r10
-; AVX-NEXT: movq %rcx, %r12
-; AVX-NEXT: movq %rdx, %r11
+; AVX-NEXT: movq %rcx, %r11
+; AVX-NEXT: movq %rdx, %rcx
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9
; AVX-NEXT: testq %r10, %r10
@@ -3148,53 +3141,52 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: andb %dl, %bpl
; AVX-NEXT: mulq %r8
; AVX-NEXT: movq %rax, %rsi
-; AVX-NEXT: seto %bl
+; AVX-NEXT: seto %r15b
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %rdi
-; AVX-NEXT: seto %cl
-; AVX-NEXT: orb %bl, %cl
-; AVX-NEXT: leaq (%rsi,%rax), %rbx
+; AVX-NEXT: seto %r12b
+; AVX-NEXT: orb %r15b, %r12b
+; AVX-NEXT: leaq (%rsi,%rax), %r10
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: mulq %r8
; AVX-NEXT: movq %rax, %rdi
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: addq %rbx, %rsi
-; AVX-NEXT: setb %r13b
-; AVX-NEXT: orb %cl, %r13b
-; AVX-NEXT: orb %bpl, %r13b
+; AVX-NEXT: addq %r10, %rsi
+; AVX-NEXT: setb %r10b
+; AVX-NEXT: orb %r12b, %r10b
+; AVX-NEXT: orb %bpl, %r10b
; AVX-NEXT: testq %r9, %r9
; AVX-NEXT: setne %al
-; AVX-NEXT: testq %r12, %r12
-; AVX-NEXT: setne %r10b
-; AVX-NEXT: andb %al, %r10b
-; AVX-NEXT: movq %r12, %rax
+; AVX-NEXT: testq %r11, %r11
+; AVX-NEXT: setne %bpl
+; AVX-NEXT: andb %al, %bpl
+; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %r14
-; AVX-NEXT: movq %rax, %rbp
-; AVX-NEXT: seto %r8b
+; AVX-NEXT: movq %rax, %r8
+; AVX-NEXT: seto %r11b
; AVX-NEXT: movq %r9, %rax
-; AVX-NEXT: mulq %r11
-; AVX-NEXT: seto %cl
-; AVX-NEXT: orb %r8b, %cl
-; AVX-NEXT: addq %rax, %rbp
-; AVX-NEXT: movq %r11, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: seto %r9b
+; AVX-NEXT: orb %r11b, %r9b
+; AVX-NEXT: addq %rax, %r8
+; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %r14
-; AVX-NEXT: addq %rbp, %rdx
-; AVX-NEXT: setb %bl
-; AVX-NEXT: orb %cl, %bl
-; AVX-NEXT: orb %r10b, %bl
-; AVX-NEXT: movzbl %bl, %ecx
+; AVX-NEXT: addq %r8, %rdx
+; AVX-NEXT: setb %cl
+; AVX-NEXT: orb %r9b, %cl
+; AVX-NEXT: orb %bpl, %cl
+; AVX-NEXT: movzbl %cl, %ecx
; AVX-NEXT: negl %ecx
-; AVX-NEXT: movzbl %r13b, %ebp
-; AVX-NEXT: negl %ebp
-; AVX-NEXT: vmovd %ebp, %xmm0
+; AVX-NEXT: movzbl %r10b, %r8d
+; AVX-NEXT: negl %r8d
+; AVX-NEXT: vmovd %r8d, %xmm0
; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, 16(%r15)
-; AVX-NEXT: movq %rdi, (%r15)
-; AVX-NEXT: movq %rdx, 24(%r15)
-; AVX-NEXT: movq %rsi, 8(%r15)
+; AVX-NEXT: movq %rax, 16(%rbx)
+; AVX-NEXT: movq %rdi, (%rbx)
+; AVX-NEXT: movq %rdx, 24(%rbx)
+; AVX-NEXT: movq %rsi, 8(%rbx)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
-; AVX-NEXT: popq %r13
; AVX-NEXT: popq %r14
; AVX-NEXT: popq %r15
; AVX-NEXT: popq %rbp
@@ -3208,63 +3200,63 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rcx, %rax
-; AVX512F-NEXT: movq %rdx, %r12
-; AVX512F-NEXT: movq %rdi, %r11
+; AVX512F-NEXT: movq %rdx, %rcx
+; AVX512F-NEXT: movq %rsi, %r10
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: testq %r10, %r10
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512F-NEXT: testq %rsi, %rsi
; AVX512F-NEXT: setne %dl
-; AVX512F-NEXT: testq %rcx, %rcx
-; AVX512F-NEXT: setne %bl
-; AVX512F-NEXT: andb %dl, %bl
-; AVX512F-NEXT: mulq %r15
-; AVX512F-NEXT: movq %rax, %rdi
-; AVX512F-NEXT: seto %bpl
-; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %r12
-; AVX512F-NEXT: seto %cl
-; AVX512F-NEXT: orb %bpl, %cl
-; AVX512F-NEXT: leaq (%rdi,%rax), %rbp
-; AVX512F-NEXT: movq %r12, %rax
-; AVX512F-NEXT: mulq %r15
-; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: movq %rdx, %rdi
-; AVX512F-NEXT: addq %rbp, %rdi
+; AVX512F-NEXT: testq %rax, %rax
+; AVX512F-NEXT: setne %bpl
+; AVX512F-NEXT: andb %dl, %bpl
+; AVX512F-NEXT: mulq %r14
+; AVX512F-NEXT: movq %rax, %r11
+; AVX512F-NEXT: seto %r15b
+; AVX512F-NEXT: movq %rsi, %rax
+; AVX512F-NEXT: mulq %rcx
+; AVX512F-NEXT: seto %r12b
+; AVX512F-NEXT: orb %r15b, %r12b
+; AVX512F-NEXT: addq %rax, %r11
+; AVX512F-NEXT: movq %rcx, %rax
+; AVX512F-NEXT: mulq %r14
+; AVX512F-NEXT: movq %rax, %rsi
+; AVX512F-NEXT: movq %rdx, %rcx
+; AVX512F-NEXT: addq %r11, %rcx
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: orb %cl, %al
-; AVX512F-NEXT: orb %bl, %al
+; AVX512F-NEXT: orb %r12b, %al
+; AVX512F-NEXT: orb %bpl, %al
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: testq %r9, %r9
; AVX512F-NEXT: setne %al
-; AVX512F-NEXT: testq %rsi, %rsi
-; AVX512F-NEXT: setne %cl
-; AVX512F-NEXT: andb %al, %cl
-; AVX512F-NEXT: movq %rsi, %rax
+; AVX512F-NEXT: testq %r10, %r10
+; AVX512F-NEXT: setne %r11b
+; AVX512F-NEXT: andb %al, %r11b
+; AVX512F-NEXT: movq %r10, %rax
; AVX512F-NEXT: mulq %r8
-; AVX512F-NEXT: movq %rax, %rsi
+; AVX512F-NEXT: movq %rax, %r10
; AVX512F-NEXT: seto %bpl
; AVX512F-NEXT: movq %r9, %rax
-; AVX512F-NEXT: mulq %r11
-; AVX512F-NEXT: seto %bl
-; AVX512F-NEXT: orb %bpl, %bl
-; AVX512F-NEXT: addq %rax, %rsi
-; AVX512F-NEXT: movq %r11, %rax
+; AVX512F-NEXT: mulq %rdi
+; AVX512F-NEXT: seto %r9b
+; AVX512F-NEXT: orb %bpl, %r9b
+; AVX512F-NEXT: addq %rax, %r10
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: mulq %r8
-; AVX512F-NEXT: addq %rsi, %rdx
-; AVX512F-NEXT: setb %sil
-; AVX512F-NEXT: orb %bl, %sil
-; AVX512F-NEXT: orb %cl, %sil
-; AVX512F-NEXT: andl $1, %esi
-; AVX512F-NEXT: kmovw %esi, %k1
+; AVX512F-NEXT: addq %r10, %rdx
+; AVX512F-NEXT: setb %dil
+; AVX512F-NEXT: orb %r9b, %dil
+; AVX512F-NEXT: orb %r11b, %dil
+; AVX512F-NEXT: andl $1, %edi
+; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: kshiftlw $1, %k0, %k0
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT: movq %r10, 16(%r14)
-; AVX512F-NEXT: movq %rax, (%r14)
-; AVX512F-NEXT: movq %rdi, 24(%r14)
-; AVX512F-NEXT: movq %rdx, 8(%r14)
+; AVX512F-NEXT: movq %rsi, 16(%rbx)
+; AVX512F-NEXT: movq %rax, (%rbx)
+; AVX512F-NEXT: movq %rcx, 24(%rbx)
+; AVX512F-NEXT: movq %rdx, 8(%rbx)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
; AVX512F-NEXT: popq %r14
@@ -3280,63 +3272,63 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
; AVX512BW-NEXT: movq %rcx, %rax
-; AVX512BW-NEXT: movq %rdx, %r12
-; AVX512BW-NEXT: movq %rdi, %r11
+; AVX512BW-NEXT: movq %rdx, %rcx
+; AVX512BW-NEXT: movq %rsi, %r10
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: testq %r10, %r10
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512BW-NEXT: testq %rsi, %rsi
; AVX512BW-NEXT: setne %dl
-; AVX512BW-NEXT: testq %rcx, %rcx
-; AVX512BW-NEXT: setne %bl
-; AVX512BW-NEXT: andb %dl, %bl
-; AVX512BW-NEXT: mulq %r15
-; AVX512BW-NEXT: movq %rax, %rdi
-; AVX512BW-NEXT: seto %bpl
-; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %r12
-; AVX512BW-NEXT: seto %cl
-; AVX512BW-NEXT: orb %bpl, %cl
-; AVX512BW-NEXT: leaq (%rdi,%rax), %rbp
-; AVX512BW-NEXT: movq %r12, %rax
-; AVX512BW-NEXT: mulq %r15
-; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: movq %rdx, %rdi
-; AVX512BW-NEXT: addq %rbp, %rdi
+; AVX512BW-NEXT: testq %rax, %rax
+; AVX512BW-NEXT: setne %bpl
+; AVX512BW-NEXT: andb %dl, %bpl
+; AVX512BW-NEXT: mulq %r14
+; AVX512BW-NEXT: movq %rax, %r11
+; AVX512BW-NEXT: seto %r15b
+; AVX512BW-NEXT: movq %rsi, %rax
+; AVX512BW-NEXT: mulq %rcx
+; AVX512BW-NEXT: seto %r12b
+; AVX512BW-NEXT: orb %r15b, %r12b
+; AVX512BW-NEXT: addq %rax, %r11
+; AVX512BW-NEXT: movq %rcx, %rax
+; AVX512BW-NEXT: mulq %r14
+; AVX512BW-NEXT: movq %rax, %rsi
+; AVX512BW-NEXT: movq %rdx, %rcx
+; AVX512BW-NEXT: addq %r11, %rcx
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: orb %cl, %al
-; AVX512BW-NEXT: orb %bl, %al
+; AVX512BW-NEXT: orb %r12b, %al
+; AVX512BW-NEXT: orb %bpl, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: testq %r9, %r9
; AVX512BW-NEXT: setne %al
-; AVX512BW-NEXT: testq %rsi, %rsi
-; AVX512BW-NEXT: setne %cl
-; AVX512BW-NEXT: andb %al, %cl
-; AVX512BW-NEXT: movq %rsi, %rax
+; AVX512BW-NEXT: testq %r10, %r10
+; AVX512BW-NEXT: setne %r11b
+; AVX512BW-NEXT: andb %al, %r11b
+; AVX512BW-NEXT: movq %r10, %rax
; AVX512BW-NEXT: mulq %r8
-; AVX512BW-NEXT: movq %rax, %rsi
+; AVX512BW-NEXT: movq %rax, %r10
; AVX512BW-NEXT: seto %bpl
; AVX512BW-NEXT: movq %r9, %rax
-; AVX512BW-NEXT: mulq %r11
-; AVX512BW-NEXT: seto %bl
-; AVX512BW-NEXT: orb %bpl, %bl
-; AVX512BW-NEXT: addq %rax, %rsi
-; AVX512BW-NEXT: movq %r11, %rax
+; AVX512BW-NEXT: mulq %rdi
+; AVX512BW-NEXT: seto %r9b
+; AVX512BW-NEXT: orb %bpl, %r9b
+; AVX512BW-NEXT: addq %rax, %r10
+; AVX512BW-NEXT: movq %rdi, %rax
; AVX512BW-NEXT: mulq %r8
-; AVX512BW-NEXT: addq %rsi, %rdx
-; AVX512BW-NEXT: setb %sil
-; AVX512BW-NEXT: orb %bl, %sil
-; AVX512BW-NEXT: orb %cl, %sil
-; AVX512BW-NEXT: andl $1, %esi
-; AVX512BW-NEXT: kmovw %esi, %k1
+; AVX512BW-NEXT: addq %r10, %rdx
+; AVX512BW-NEXT: setb %dil
+; AVX512BW-NEXT: orb %r9b, %dil
+; AVX512BW-NEXT: orb %r11b, %dil
+; AVX512BW-NEXT: andl $1, %edi
+; AVX512BW-NEXT: kmovw %edi, %k1
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT: movq %r10, 16(%r14)
-; AVX512BW-NEXT: movq %rax, (%r14)
-; AVX512BW-NEXT: movq %rdi, 24(%r14)
-; AVX512BW-NEXT: movq %rdx, 8(%r14)
+; AVX512BW-NEXT: movq %rsi, 16(%rbx)
+; AVX512BW-NEXT: movq %rax, (%rbx)
+; AVX512BW-NEXT: movq %rcx, 24(%rbx)
+; AVX512BW-NEXT: movq %rdx, 8(%rbx)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
; AVX512BW-NEXT: popq %r14
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 7c94da65ff145..2ccd3856ab7c8 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -561,26 +561,26 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; SSE41-NEXT: psubd %xmm4, %xmm8
; SSE41-NEXT: pminud %xmm8, %xmm0
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psubd %xmm5, %xmm4
-; SSE41-NEXT: pminud %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE41-NEXT: pxor %xmm9, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm9
+; SSE41-NEXT: psubd %xmm5, %xmm9
+; SSE41-NEXT: pminud %xmm9, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: psubd %xmm6, %xmm5
; SSE41-NEXT: pminud %xmm5, %xmm2
; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm6
; SSE41-NEXT: psubd %xmm7, %xmm6
; SSE41-NEXT: pminud %xmm6, %xmm3
; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
-; SSE41-NEXT: pxor %xmm9, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm6, 48(%rdi)
; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
-; SSE41-NEXT: movdqa %xmm4, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm9, 16(%rdi)
; SSE41-NEXT: movdqa %xmm8, (%rdi)
; SSE41-NEXT: retq
;
@@ -1186,102 +1186,102 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v2i128:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: xorl %r11d, %r11d
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: xorl %r10d, %r10d
; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movl $0, %eax
-; SSE2-NEXT: sbbl %eax, %eax
+; SSE2-NEXT: movl $0, %r11d
+; SSE2-NEXT: sbbl %r11d, %r11d
; SSE2-NEXT: subq %r8, %rdi
; SSE2-NEXT: sbbq %r9, %rsi
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: sbbl %r11d, %r11d
-; SSE2-NEXT: movd %r11d, %xmm0
+; SSE2-NEXT: movd %r11d, %xmm1
+; SSE2-NEXT: sbbl %r10d, %r10d
+; SSE2-NEXT: movd %r10d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rdx, 16(%r10)
-; SSE2-NEXT: movq %rdi, (%r10)
-; SSE2-NEXT: movq %rcx, 24(%r10)
-; SSE2-NEXT: movq %rsi, 8(%r10)
+; SSE2-NEXT: movq %rdx, 16(%rax)
+; SSE2-NEXT: movq %rdi, (%rax)
+; SSE2-NEXT: movq %rcx, 24(%rax)
+; SSE2-NEXT: movq %rsi, 8(%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v2i128:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSSE3-NEXT: xorl %r11d, %r11d
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: xorl %r10d, %r10d
; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: movl $0, %eax
-; SSSE3-NEXT: sbbl %eax, %eax
+; SSSE3-NEXT: movl $0, %r11d
+; SSSE3-NEXT: sbbl %r11d, %r11d
; SSSE3-NEXT: subq %r8, %rdi
; SSSE3-NEXT: sbbq %r9, %rsi
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: sbbl %r11d, %r11d
-; SSSE3-NEXT: movd %r11d, %xmm0
+; SSSE3-NEXT: movd %r11d, %xmm1
+; SSSE3-NEXT: sbbl %r10d, %r10d
+; SSSE3-NEXT: movd %r10d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rdx, 16(%r10)
-; SSSE3-NEXT: movq %rdi, (%r10)
-; SSSE3-NEXT: movq %rcx, 24(%r10)
-; SSSE3-NEXT: movq %rsi, 8(%r10)
+; SSSE3-NEXT: movq %rdx, 16(%rax)
+; SSSE3-NEXT: movq %rdi, (%rax)
+; SSSE3-NEXT: movq %rcx, 24(%rax)
+; SSSE3-NEXT: movq %rsi, 8(%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v2i128:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE41-NEXT: xorl %r11d, %r11d
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE41-NEXT: xorl %r10d, %r10d
; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: movl $0, %eax
-; SSE41-NEXT: sbbl %eax, %eax
+; SSE41-NEXT: movl $0, %r11d
+; SSE41-NEXT: sbbl %r11d, %r11d
; SSE41-NEXT: subq %r8, %rdi
; SSE41-NEXT: sbbq %r9, %rsi
-; SSE41-NEXT: sbbl %r11d, %r11d
-; SSE41-NEXT: movd %r11d, %xmm0
-; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: movq %rdx, 16(%r10)
-; SSE41-NEXT: movq %rdi, (%r10)
-; SSE41-NEXT: movq %rcx, 24(%r10)
-; SSE41-NEXT: movq %rsi, 8(%r10)
+; SSE41-NEXT: sbbl %r10d, %r10d
+; SSE41-NEXT: movd %r10d, %xmm0
+; SSE41-NEXT: pinsrd $1, %r11d, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%rax)
+; SSE41-NEXT: movq %rdi, (%rax)
+; SSE41-NEXT: movq %rcx, 24(%rax)
+; SSE41-NEXT: movq %rsi, 8(%rax)
; SSE41-NEXT: retq
;
; AVX-LABEL: usubo_v2i128:
; AVX: # %bb.0:
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX-NEXT: xorl %r11d, %r11d
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: xorl %r10d, %r10d
; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movl $0, %eax
-; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: movl $0, %r11d
+; AVX-NEXT: sbbl %r11d, %r11d
; AVX-NEXT: subq %r8, %rdi
; AVX-NEXT: sbbq %r9, %rsi
-; AVX-NEXT: sbbl %r11d, %r11d
-; AVX-NEXT: vmovd %r11d, %xmm0
-; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %rdx, 16(%r10)
-; AVX-NEXT: movq %rdi, (%r10)
-; AVX-NEXT: movq %rcx, 24(%r10)
-; AVX-NEXT: movq %rsi, 8(%r10)
+; AVX-NEXT: sbbl %r10d, %r10d
+; AVX-NEXT: vmovd %r10d, %xmm0
+; AVX-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
+; AVX-NEXT: movq %rdx, 16(%rax)
+; AVX-NEXT: movq %rdi, (%rax)
+; AVX-NEXT: movq %rcx, 24(%rax)
+; AVX-NEXT: movq %rsi, 8(%rax)
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v2i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: setb %al
-; AVX512-NEXT: kmovd %eax, %k0
+; AVX512-NEXT: setb %r10b
+; AVX512-NEXT: kmovd %r10d, %k0
; AVX512-NEXT: subq %r8, %rdi
; AVX512-NEXT: sbbq %r9, %rsi
-; AVX512-NEXT: setb %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: setb %r8b
+; AVX512-NEXT: andl $1, %r8d
+; AVX512-NEXT: kmovw %r8d, %k1
; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: korw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: movq %rdx, 16(%r10)
-; AVX512-NEXT: movq %rdi, (%r10)
-; AVX512-NEXT: movq %rcx, 24(%r10)
-; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: movq %rdx, 16(%rax)
+; AVX512-NEXT: movq %rdi, (%rax)
+; AVX512-NEXT: movq %rcx, 24(%rax)
+; AVX512-NEXT: movq %rsi, 8(%rax)
; AVX512-NEXT: retq
%t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 6f6693ee6dbea..73145d8f876ef 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -1569,8 +1569,8 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: pand %xmm8, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm5
; SSSE3-NEXT: pand %xmm8, %xmm5
@@ -1580,7 +1580,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: pand %xmm8, %xmm5
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm8, %xmm1
@@ -1589,20 +1589,20 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSSE3-NEXT: por %xmm6, %xmm5
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pshufb %xmm1, %xmm7
+; SSSE3-NEXT: movdqa %xmm7, %xmm9
+; SSSE3-NEXT: pshufb %xmm1, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm2
; SSSE3-NEXT: pand %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm6
; SSSE3-NEXT: pshufb %xmm2, %xmm6
-; SSSE3-NEXT: por %xmm7, %xmm6
+; SSSE3-NEXT: por %xmm9, %xmm6
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm8, %xmm1
-; SSSE3-NEXT: pshufb %xmm1, %xmm9
+; SSSE3-NEXT: pshufb %xmm1, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm3
; SSSE3-NEXT: pand %xmm8, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
-; SSSE3-NEXT: por %xmm9, %xmm4
+; SSSE3-NEXT: por %xmm7, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
@@ -1870,46 +1870,46 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: pshufb %xmm8, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm9, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm0, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm1
-; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: pand %xmm7, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: por %xmm9, %xmm0
; SSSE3-NEXT: pshufb %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm1
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm1, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm5
-; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: pand %xmm7, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm1
-; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: por %xmm9, %xmm1
; SSSE3-NEXT: pshufb %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pand %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm5
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm5, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm2
-; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pand %xmm7, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: por %xmm9, %xmm5
; SSSE3-NEXT: pshufb %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pand %xmm9, %xmm2
-; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm3
-; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pand %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
-; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: por %xmm6, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
@@ -2106,12 +2106,12 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm5, %xmm0
@@ -2129,73 +2129,73 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: psrlw $1, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE2-NEXT: por %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: packuswb %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psrlw $4, %xmm8
+; SSE2-NEXT: pand %xmm5, %xmm8
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psrlw $2, %xmm8
+; SSE2-NEXT: pand %xmm6, %xmm8
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psrlw $1, %xmm8
+; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2-NEXT: por %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: packuswb %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: psrlw $4, %xmm8
+; SSE2-NEXT: pand %xmm5, %xmm8
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: psrlw $2, %xmm8
+; SSE2-NEXT: pand %xmm6, %xmm8
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: psllw $2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: psrlw $1, %xmm8
+; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm8, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
@@ -2222,46 +2222,46 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: pshufb %xmm8, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm9, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm0, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm1
-; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: pand %xmm7, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: por %xmm9, %xmm0
; SSSE3-NEXT: pshufb %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm1
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm1, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm5
-; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: pand %xmm7, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm1
-; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: por %xmm9, %xmm1
; SSSE3-NEXT: pshufb %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pand %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm5
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm5, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm2
-; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pand %xmm7, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: por %xmm9, %xmm5
; SSSE3-NEXT: pshufb %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pand %xmm9, %xmm2
-; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm3
-; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pand %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
-; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: por %xmm6, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
@@ -2458,13 +2458,13 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
@@ -2483,79 +2483,79 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: psrlw $1, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE2-NEXT: por %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: packuswb %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psrlw $4, %xmm8
+; SSE2-NEXT: pand %xmm5, %xmm8
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: psllw $4, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psrlw $2, %xmm8
+; SSE2-NEXT: pand %xmm6, %xmm8
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psrlw $1, %xmm8
+; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2-NEXT: por %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: packuswb %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: psrlw $4, %xmm8
+; SSE2-NEXT: pand %xmm5, %xmm8
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: psrlw $2, %xmm8
+; SSE2-NEXT: pand %xmm6, %xmm8
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: psllw $2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: psrlw $1, %xmm8
+; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm8, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
@@ -2582,46 +2582,46 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: pshufb %xmm8, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm9, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm0, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm1
-; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: pand %xmm7, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: por %xmm9, %xmm0
; SSSE3-NEXT: pshufb %xmm8, %xmm5
; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm1
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm1, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm5
-; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: pand %xmm7, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm1
-; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: por %xmm9, %xmm1
; SSSE3-NEXT: pshufb %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pand %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm5
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pshufb %xmm5, %xmm9
; SSSE3-NEXT: psrlw $4, %xmm2
-; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pand %xmm7, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pshufb %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: por %xmm9, %xmm5
; SSSE3-NEXT: pshufb %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pand %xmm9, %xmm2
-; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm3
-; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pand %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm4
-; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: por %xmm6, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll
index 568dff8e13585..5a307925bb7fe 100644
--- a/llvm/test/CodeGen/X86/vector-compare-results.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-results.ll
@@ -1109,33 +1109,33 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm10
-; SSE2-NEXT: packssdw %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm6
+; SSE2-NEXT: packssdw %xmm9, %xmm6
; SSE2-NEXT: pxor %xmm8, %xmm5
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm7
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: packssdw %xmm7, %xmm4
; SSE2-NEXT: packssdw %xmm6, %xmm4
-; SSE2-NEXT: packssdw %xmm10, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm3
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
@@ -1210,13 +1210,13 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackssdw %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3
@@ -1313,13 +1313,13 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
@@ -1802,13 +1802,13 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX1-NEXT: vmovapd 176(%rbp), %ymm13
; AVX1-NEXT: vmovapd 208(%rbp), %ymm14
; AVX1-NEXT: vmovapd 240(%rbp), %ymm15
-; AVX1-NEXT: vcmpltpd %ymm7, %ymm15, %ymm15
-; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm7
-; AVX1-NEXT: vpackssdw %xmm7, %xmm15, %xmm15
+; AVX1-NEXT: vcmpltpd %ymm7, %ymm15, %ymm7
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm15
+; AVX1-NEXT: vpackssdw %xmm15, %xmm7, %xmm7
; AVX1-NEXT: vcmpltpd %ymm6, %ymm14, %ymm6
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm14
+; AVX1-NEXT: vpackssdw %xmm14, %xmm6, %xmm6
; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackssdw %xmm15, %xmm6, %xmm6
; AVX1-NEXT: vcmpltpd %ymm5, %ymm13, %ymm5
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7
; AVX1-NEXT: vpackssdw %xmm7, %xmm5, %xmm5
@@ -1948,33 +1948,33 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT: pand %xmm11, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm10
-; SSE2-NEXT: packssdw %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm6
+; SSE2-NEXT: packssdw %xmm9, %xmm6
; SSE2-NEXT: pxor %xmm8, %xmm5
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm7
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: packssdw %xmm7, %xmm4
; SSE2-NEXT: packssdw %xmm6, %xmm4
-; SSE2-NEXT: packssdw %xmm10, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm3
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
@@ -2189,12 +2189,12 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
; AVX1-NEXT: vpcmpgtq 256(%rbp), %xmm8, %xmm8
; AVX1-NEXT: vpcmpgtq 240(%rbp), %xmm7, %xmm7
-; AVX1-NEXT: vpackssdw %xmm8, %xmm7, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT: vpcmpgtq 224(%rbp), %xmm7, %xmm7
+; AVX1-NEXT: vpackssdw %xmm8, %xmm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8
+; AVX1-NEXT: vpcmpgtq 224(%rbp), %xmm8, %xmm8
; AVX1-NEXT: vpcmpgtq 208(%rbp), %xmm6, %xmm6
-; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm6
+; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7
; AVX1-NEXT: vpcmpgtq 192(%rbp), %xmm7, %xmm7
; AVX1-NEXT: vpcmpgtq 176(%rbp), %xmm5, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index f49e6ed5ab03c..7c6199f30a756 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -237,26 +237,26 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; SSE41-LABEL: var_funnnel_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31]
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pandn %xmm8, %xmm4
+; SSE41-NEXT: pandn %xmm3, %xmm4
; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm5, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrld %xmm7, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: psrld %xmm7, %xmm8
+; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm4, %xmm6
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
-; SSE41-NEXT: pand %xmm8, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
+; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: pslld $23, %xmm2
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 71732bcb72041..5cc7cd2fc45da 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -163,44 +163,44 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [31,31,31,31]
-; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpsrld $1, %xmm7, %xmm7
; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm4
-; AVX1-NEXT: vpsrld %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
-; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm8
+; AVX1-NEXT: vpsrld %xmm8, %xmm7, %xmm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
+; AVX1-NEXT: vpsrld %xmm9, %xmm7, %xmm9
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
; AVX1-NEXT: vpsrld %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpmulld %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm4
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpmulld %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7
; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -479,65 +479,65 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm7, %xmm9, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7
; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm6, %xmm9, %xmm6
+; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
-; AVX1-NEXT: vpmullw %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm6, %xmm9, %xmm6
-; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
+; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6
+; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpslld $23, %xmm9, %xmm9
+; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9
+; AVX1-NEXT: vcvttps2dq %xmm9, %xmm9
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vpackusdw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
+; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm9, %xmm5
+; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
+; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm9, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v32i8:
@@ -1258,7 +1258,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) {
; AVX1-NEXT: vmovd %edx, %xmm1
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
@@ -1266,32 +1266,32 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) {
; AVX1-NEXT: .LBB8_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm8, %xmm3
-; AVX1-NEXT: vpmovsxbd %xmm3, %xmm10
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; AVX1-NEXT: vpmovsxbd %xmm3, %xmm9
+; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm5
; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,3,3]
-; AVX1-NEXT: vpsllq %xmm1, %xmm7, %xmm0
+; AVX1-NEXT: vpsllq %xmm1, %xmm7, %xmm8
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,3],xmm0[1,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3]
-; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm9
+; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm9[1,3],xmm8[1,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,2,3,3]
+; AVX1-NEXT: vpsllq %xmm1, %xmm9, %xmm10
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
-; AVX1-NEXT: vpsllq %xmm2, %xmm7, %xmm4
+; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm11
+; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,3],xmm10[1,3]
+; AVX1-NEXT: vpsllq %xmm2, %xmm7, %xmm7
; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,3],xmm4[1,3]
-; AVX1-NEXT: vblendvps %xmm10, %xmm11, %xmm4, %xmm4
-; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm5
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,3],xmm3[1,3]
-; AVX1-NEXT: vblendvps %xmm9, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3]
+; AVX1-NEXT: vblendvps %xmm4, %xmm8, %xmm5, %xmm4
+; AVX1-NEXT: vpsllq %xmm2, %xmm9, %xmm5
+; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm6
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,3],xmm5[1,3]
+; AVX1-NEXT: vblendvps %xmm3, %xmm10, %xmm5, %xmm3
; AVX1-NEXT: vmovups %xmm4, 4096(%rdi,%rax,4)
-; AVX1-NEXT: vmovups %xmm0, 4112(%rdi,%rax,4)
+; AVX1-NEXT: vmovups %xmm3, 4112(%rdi,%rax,4)
; AVX1-NEXT: addq $8, %rax
; AVX1-NEXT: jne .LBB8_1
; AVX1-NEXT: # %bb.2: # %exit
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index d45fc6988f3a4..d133dae69fe65 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -398,10 +398,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7
-; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8
+; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
@@ -419,7 +419,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 97046ac71270e..176276fec26a3 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -238,25 +238,25 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; SSE41-LABEL: var_funnnel_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31]
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm8, %xmm4
+; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm5, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrld %xmm7, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: psrld %xmm7, %xmm8
+; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm4, %xmm6
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
-; SSE41-NEXT: pandn %xmm8, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
+; SSE41-NEXT: pandn %xmm3, %xmm2
; SSE41-NEXT: pslld $23, %xmm2
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 6d7496e94476b..77c228a71b264 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -170,37 +170,37 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
; AVX1-NEXT: vpsrld %xmm6, %xmm3, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; AVX1-NEXT: vpsrld %xmm7, %xmm3, %xmm7
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT: vpsrld %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpsrld %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [31,31,31,31]
-; AVX1-NEXT: vpxor %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [31,31,31,31]
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpmulld %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm7
-; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
-; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm8
+; AVX1-NEXT: vpsrld %xmm8, %xmm1, %xmm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT: vpsrld %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm9, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
@@ -335,8 +335,8 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6
; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm6
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm7
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
@@ -355,17 +355,17 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm7
; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm7
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5
-; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5
-; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm8
+; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
+; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm6
+; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm6
+; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
@@ -512,66 +512,66 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm5
; AVX1-NEXT: vpsllw $4, %xmm5, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm6
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm3
-; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3
-; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpsllw $2, %xmm6, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4
-; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6
-; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm4, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm6, %xmm11, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vpsllw $5, %xmm8, %xmm8
+; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm5, %xmm6
+; AVX1-NEXT: vpsllw $2, %xmm6, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm9
+; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm9
+; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
+; AVX1-NEXT: vpsrlw $4, %xmm8, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm10, %xmm9, %xmm9
; AVX1-NEXT: vpsllw $5, %xmm7, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm6, %xmm12, %xmm6
+; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8
+; AVX1-NEXT: vpsrlw $2, %xmm8, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm11, %xmm9, %xmm9
; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8
+; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm12, %xmm9, %xmm9
; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm7
+; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm4
-; AVX1-NEXT: vpxor %xmm2, %xmm9, %xmm6
-; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4
-; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
-; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm11, %xmm4
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm7
+; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm3
; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm12, %xmm4
+; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm11, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm12, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v32i8:
@@ -1599,43 +1599,43 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX1-LABEL: constant_funnnel_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,2,4,8,16,32,64,128]
-; AVX1-NEXT: vpmullw %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,2,4,8,16,32,64,128]
+; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [256,128,64,32,16,8,4,2]
-; AVX1-NEXT: vpmullw %xmm2, %xmm10, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,128,64,32,16,8,4,2]
+; AVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,1,2,4,8,16,32,64]
-; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,1,2,4,8,16,32,64]
+; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm7
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [128,64,32,16,8,4,2,1]
-; AVX1-NEXT: vpmullw %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,64,32,16,8,4,2,1]
+; AVX1-NEXT: vpmullw %xmm4, %xmm10, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm4
; AVX1-NEXT: vpackuswb %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm10, %xmm1
+; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmullw %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm0, %xmm10, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 1380a5496eaac..810e395f727da 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -134,8 +134,8 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpsubd %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
@@ -144,15 +144,15 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmuludq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7]
+; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
@@ -408,23 +408,23 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpsubb %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpsubb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpandn %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpsllw $2, %xmm2, %xmm6
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsllw $2, %xmm2, %xmm8
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
+; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm6
-; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm9
+; AVX1-NEXT: vpor %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
@@ -432,7 +432,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsubb %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
@@ -443,7 +443,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 17dc0778d6830..279f3c464411c 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -555,18 +555,18 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $3, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5
; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
@@ -578,14 +578,14 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2
-; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2
; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubb %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 56a2f7f69bb3f..805dd422ac491 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -578,8 +578,8 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
@@ -596,26 +596,26 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $3, %xmm3, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX1-NEXT: vpand %xmm2, %xmm7, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $3, %xmm3, %xmm4
-; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll
index 311122112729e..541afdbc229ad 100644
--- a/llvm/test/CodeGen/X86/vector-interleave.ll
+++ b/llvm/test/CodeGen/X86/vector-interleave.ll
@@ -14,17 +14,17 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
-; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE-NEXT: movdqa %xmm7, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
@@ -32,15 +32,15 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; SSE-NEXT: movdqa %xmm4, %xmm6
; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: movdqa %xmm8, %xmm5
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
; SSE-NEXT: movdqa %xmm0, %xmm7
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
@@ -51,8 +51,8 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; SSE-NEXT: movdqa %xmm4, 96(%rdi)
; SSE-NEXT: movdqa %xmm0, 80(%rdi)
; SSE-NEXT: movdqa %xmm7, 64(%rdi)
-; SSE-NEXT: movdqa %xmm2, 48(%rdi)
-; SSE-NEXT: movdqa %xmm1, 32(%rdi)
+; SSE-NEXT: movdqa %xmm1, 48(%rdi)
+; SSE-NEXT: movdqa %xmm2, 32(%rdi)
; SSE-NEXT: movdqa %xmm8, 16(%rdi)
; SSE-NEXT: movdqa %xmm5, (%rdi)
; SSE-NEXT: retq
@@ -63,9 +63,9 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
@@ -78,15 +78,15 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave8x8:
@@ -95,9 +95,9 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
@@ -110,15 +110,15 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX2-NEXT: retq
%ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index 18dbb3ffec159..6e2f1b7fa8117 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -263,57 +263,57 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa 64(%rdi), %xmm1
-; SSE-NEXT: movdqa 80(%rdi), %xmm10
+; SSE-NEXT: movdqa 80(%rdi), %xmm4
; SSE-NEXT: movdqa 96(%rdi), %xmm0
; SSE-NEXT: movdqa 112(%rdi), %xmm7
; SSE-NEXT: movdqa (%rdi), %xmm3
-; SSE-NEXT: movdqa 16(%rdi), %xmm11
+; SSE-NEXT: movdqa 16(%rdi), %xmm6
; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: movdqa 48(%rdi), %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm4[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm4[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm6[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; SSE-NEXT: psrad $16, %xmm5
+; SSE-NEXT: movdqa 48(%rdi), %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm11[0]
+; SSE-NEXT: psrad $16, %xmm9
; SSE-NEXT: psrad $16, %xmm2
-; SSE-NEXT: packssdw %xmm5, %xmm2
+; SSE-NEXT: packssdw %xmm9, %xmm2
; SSE-NEXT: psrad $16, %xmm7
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm7, %xmm0
-; SSE-NEXT: psrad $16, %xmm11
+; SSE-NEXT: psrad $16, %xmm6
; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: packssdw %xmm11, %xmm3
-; SSE-NEXT: psrad $16, %xmm10
+; SSE-NEXT: packssdw %xmm6, %xmm3
+; SSE-NEXT: psrad $16, %xmm4
; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: packssdw %xmm10, %xmm1
-; SSE-NEXT: movdqa %xmm4, 32(%rsi)
-; SSE-NEXT: movdqa %xmm12, (%rsi)
-; SSE-NEXT: movdqa %xmm9, 48(%rsi)
-; SSE-NEXT: movdqa %xmm8, 16(%rsi)
+; SSE-NEXT: packssdw %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm12, 32(%rsi)
+; SSE-NEXT: movdqa %xmm10, (%rsi)
+; SSE-NEXT: movdqa %xmm8, 48(%rsi)
+; SSE-NEXT: movdqa %xmm5, 16(%rsi)
; SSE-NEXT: movdqa %xmm1, 32(%rdx)
; SSE-NEXT: movdqa %xmm3, (%rdx)
; SSE-NEXT: movdqa %xmm0, 48(%rdx)
@@ -323,46 +323,46 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4],xmm0[5],xmm10[6],xmm0[7]
-; AVX1-NEXT: vmovdqa 96(%rdi), %xmm11
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm0[1],xmm11[2],xmm0[3],xmm11[4],xmm0[5],xmm11[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm8
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm0[1],xmm12[2],xmm0[3],xmm12[4],xmm0[5],xmm12[6],xmm0[7]
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vmovdqa 96(%rdi), %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm9
+; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vmovdqa (%rdi), %xmm7
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm9
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm10
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4],xmm0[5],xmm10[6],xmm0[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
+; AVX1-NEXT: vpackusdw %xmm11, %xmm12, %xmm11
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm0[1],xmm8[2],xmm0[3],xmm8[4],xmm0[5],xmm8[6],xmm0[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm12, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm10, %xmm10
+; AVX1-NEXT: vpsrld $16, %xmm9, %xmm9
+; AVX1-NEXT: vpackusdw %xmm10, %xmm9, %xmm9
+; AVX1-NEXT: vpsrld $16, %xmm8, %xmm8
+; AVX1-NEXT: vpsrld $16, %xmm7, %xmm7
+; AVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm5, %xmm4
-; AVX1-NEXT: vpackusdw %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm7, %xmm4
-; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm10, %xmm4
-; AVX1-NEXT: vpsrld $16, %xmm11, %xmm5
-; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsrld $16, %xmm12, %xmm5
-; AVX1-NEXT: vpsrld $16, %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm4, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm6, %xmm4
+; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT: vmovdqa %xmm3, 16(%rsi)
-; AVX1-NEXT: vmovdqa %xmm9, 32(%rsi)
-; AVX1-NEXT: vmovdqa %xmm8, 48(%rsi)
-; AVX1-NEXT: vmovdqa %xmm5, 32(%rdx)
-; AVX1-NEXT: vmovdqa %xmm4, 48(%rdx)
-; AVX1-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm11, 16(%rsi)
+; AVX1-NEXT: vmovdqa %xmm5, 32(%rsi)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rsi)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm7, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm9, 16(%rdx)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf32:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index caff86307237c..b5e0f0039b78b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -197,7 +197,7 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounw
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm3
; SSE-NEXT: movdqa 16(%rdi), %xmm2
-; SSE-NEXT: movdqa 32(%rdi), %xmm8
+; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pand %xmm1, %xmm4
@@ -208,7 +208,7 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounw
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
@@ -225,12 +225,12 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounw
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
; SSE-NEXT: pand %xmm6, %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: pandn %xmm7, %xmm0
-; SSE-NEXT: por %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: pandn %xmm7, %xmm8
+; SSE-NEXT: por %xmm5, %xmm8
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pandn %xmm3, %xmm4
; SSE-NEXT: por %xmm2, %xmm4
@@ -239,12 +239,12 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounw
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
-; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE-NEXT: pandn %xmm0, %xmm6
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: movaps %xmm1, (%rsi)
-; SSE-NEXT: movdqa %xmm0, (%rdx)
+; SSE-NEXT: movdqa %xmm8, (%rdx)
; SSE-NEXT: movdqa %xmm6, (%rcx)
; SSE-NEXT: retq
;
@@ -321,107 +321,107 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounw
define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa 80(%rdi), %xmm8
-; SSE-NEXT: movdqa 64(%rdi), %xmm11
-; SSE-NEXT: movdqa (%rdi), %xmm2
+; SSE-NEXT: movdqa 80(%rdi), %xmm0
+; SSE-NEXT: movdqa 64(%rdi), %xmm1
+; SSE-NEXT: movdqa (%rdi), %xmm7
; SSE-NEXT: movdqa 16(%rdi), %xmm4
-; SSE-NEXT: movdqa 32(%rdi), %xmm10
-; SSE-NEXT: movdqa 48(%rdi), %xmm9
+; SSE-NEXT: movdqa 32(%rdi), %xmm3
+; SSE-NEXT: movdqa 48(%rdi), %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: pandn %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm4, %xmm8
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm2, %xmm7
-; SSE-NEXT: movdqa %xmm6, %xmm3
-; SSE-NEXT: pandn %xmm2, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0]
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm11, %xmm5
-; SSE-NEXT: por %xmm1, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm1[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0]
-; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm1
-; SSE-NEXT: pand %xmm6, %xmm7
-; SSE-NEXT: por %xmm1, %xmm7
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0]
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: pandn %xmm7, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pandn %xmm11, %xmm1
-; SSE-NEXT: movdqa %xmm9, %xmm7
-; SSE-NEXT: pand %xmm6, %xmm7
-; SSE-NEXT: por %xmm1, %xmm7
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: pandn %xmm7, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: movdqa %xmm6, %xmm10
+; SSE-NEXT: pandn %xmm7, %xmm10
+; SSE-NEXT: pand %xmm5, %xmm7
+; SSE-NEXT: por %xmm8, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0]
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm1, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0]
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: pandn %xmm4, %xmm8
+; SSE-NEXT: pand %xmm6, %xmm9
+; SSE-NEXT: por %xmm8, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,0,0]
+; SSE-NEXT: pand %xmm9, %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6]
+; SSE-NEXT: movdqa %xmm9, %xmm12
+; SSE-NEXT: pandn %xmm11, %xmm12
+; SSE-NEXT: por %xmm8, %xmm12
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: pandn %xmm1, %xmm8
+; SSE-NEXT: movdqa %xmm2, %xmm11
+; SSE-NEXT: pand %xmm6, %xmm11
+; SSE-NEXT: por %xmm8, %xmm11
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
+; SSE-NEXT: pand %xmm9, %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6]
+; SSE-NEXT: movdqa %xmm9, %xmm13
+; SSE-NEXT: pandn %xmm11, %xmm13
+; SSE-NEXT: por %xmm8, %xmm13
; SSE-NEXT: pand %xmm6, %xmm4
-; SSE-NEXT: por %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,7,6,7]
+; SSE-NEXT: por %xmm10, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7]
+; SSE-NEXT: pand %xmm9, %xmm4
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: por %xmm1, %xmm4
-; SSE-NEXT: pand %xmm6, %xmm11
-; SSE-NEXT: pandn %xmm9, %xmm6
-; SSE-NEXT: por %xmm11, %xmm6
+; SSE-NEXT: movdqa %xmm9, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm8
+; SSE-NEXT: por %xmm4, %xmm8
+; SSE-NEXT: pand %xmm6, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: por %xmm1, %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
-; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: por %xmm1, %xmm5
-; SSE-NEXT: movaps %xmm13, 16(%rsi)
-; SSE-NEXT: movaps %xmm12, (%rsi)
-; SSE-NEXT: movdqa %xmm2, 16(%rdx)
-; SSE-NEXT: movdqa %xmm0, (%rdx)
-; SSE-NEXT: movdqa %xmm5, 16(%rcx)
-; SSE-NEXT: movdqa %xmm4, (%rcx)
+; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE-NEXT: pandn %xmm0, %xmm9
+; SSE-NEXT: por %xmm1, %xmm9
+; SSE-NEXT: movaps %xmm5, 16(%rsi)
+; SSE-NEXT: movaps %xmm7, (%rsi)
+; SSE-NEXT: movdqa %xmm13, 16(%rdx)
+; SSE-NEXT: movdqa %xmm12, (%rdx)
+; SSE-NEXT: movdqa %xmm9, 16(%rcx)
+; SSE-NEXT: movdqa %xmm8, (%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf16:
@@ -436,21 +436,21 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) noun
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6
; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,3,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2],xmm2[3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,2,1]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm7[3,4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4],xmm8[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,1,2,3]
@@ -460,9 +460,9 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) noun
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX1-NEXT: vmovaps %ymm8, (%rsi)
-; AVX1-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX1-NEXT: vmovdqa %xmm9, 16(%rdx)
+; AVX1-NEXT: vmovaps %ymm2, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm8, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX1-NEXT: vmovdqa %xmm1, (%rcx)
; AVX1-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX1-NEXT: vzeroupper
@@ -542,398 +542,404 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) noun
define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $72, %rsp
-; SSE-NEXT: movdqa 96(%rdi), %xmm11
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm7
-; SSE-NEXT: movdqa 144(%rdi), %xmm9
-; SSE-NEXT: movdqa 160(%rdi), %xmm5
-; SSE-NEXT: movdqa 80(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdi), %xmm14
-; SSE-NEXT: movdqa 16(%rdi), %xmm10
-; SSE-NEXT: movdqa 32(%rdi), %xmm13
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: movdqa 96(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm14
+; SSE-NEXT: movdqa 144(%rdi), %xmm12
+; SSE-NEXT: movdqa 160(%rdi), %xmm10
+; SSE-NEXT: movdqa 80(%rdi), %xmm13
; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rdi), %xmm8
-; SSE-NEXT: movdqa 64(%rdi), %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm9
+; SSE-NEXT: movdqa 16(%rdi), %xmm8
+; SSE-NEXT: movdqa 32(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 48(%rdi), %xmm0
+; SSE-NEXT: movdqa 64(%rdi), %xmm11
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm11, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: por %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7]
-; SSE-NEXT: movdqa %xmm6, %xmm15
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,7,6,7]
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm10, %xmm2
+; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: por %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm12
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7]
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm10, %xmm2
-; SSE-NEXT: movdqa %xmm14, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: por %xmm2, %xmm3
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0]
; SSE-NEXT: movdqa 112(%rdi), %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm6, %xmm0
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movdqa 128(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
+; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm6, %xmm1
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: movdqa 128(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,7,6,7]
-; SSE-NEXT: movdqa %xmm6, %xmm11
+; SSE-NEXT: movdqa %xmm6, %xmm13
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pandn %xmm8, %xmm5
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm15, %xmm1
-; SSE-NEXT: pand %xmm3, %xmm8
-; SSE-NEXT: por %xmm1, %xmm8
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
-; SSE-NEXT: movdqa %xmm1, %xmm10
-; SSE-NEXT: pandn %xmm2, %xmm10
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por %xmm0, %xmm10
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pandn %xmm9, %xmm4
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: pandn %xmm12, %xmm9
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: por %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm7, %xmm13
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
-; SSE-NEXT: movdqa %xmm1, %xmm9
-; SSE-NEXT: pandn %xmm7, %xmm9
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por %xmm0, %xmm9
-; SSE-NEXT: movdqa %xmm3, %xmm7
-; SSE-NEXT: pandn %xmm14, %xmm7
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm14
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: pandn %xmm8, %xmm14
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: por %xmm14, %xmm0
-; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6]
-; SSE-NEXT: movdqa %xmm1, %xmm14
-; SSE-NEXT: pandn %xmm6, %xmm14
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por %xmm0, %xmm14
-; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm15, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm15, %xmm0
; SSE-NEXT: pandn %xmm11, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm11, %xmm6
-; SSE-NEXT: pand %xmm3, %xmm6
-; SSE-NEXT: por %xmm0, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm15, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[2,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
+; SSE-NEXT: movdqa %xmm0, %xmm7
+; SSE-NEXT: pandn %xmm2, %xmm7
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: por %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm15, %xmm6
+; SSE-NEXT: pandn %xmm12, %xmm6
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: movdqa %xmm15, %xmm12
+; SSE-NEXT: pandn %xmm10, %xmm12
+; SSE-NEXT: pand %xmm15, %xmm1
+; SSE-NEXT: por %xmm12, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm14[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,4,5,6]
+; SSE-NEXT: movdqa %xmm0, %xmm12
+; SSE-NEXT: pandn %xmm2, %xmm12
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: por %xmm1, %xmm12
+; SSE-NEXT: movdqa %xmm15, %xmm5
+; SSE-NEXT: pandn %xmm9, %xmm5
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm15, %xmm9
+; SSE-NEXT: pandn %xmm8, %xmm9
+; SSE-NEXT: pand %xmm15, %xmm1
+; SSE-NEXT: por %xmm9, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,5,6]
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pandn %xmm2, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: por %xmm1, %xmm9
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: pandn %xmm13, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm11, %xmm2
+; SSE-NEXT: pand %xmm15, %xmm2
+; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,6]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pandn %xmm3, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pand %xmm15, %xmm2
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pandn %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,1,2,0]
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm5
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: por %xmm2, %xmm4
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: pand %xmm15, %xmm2
+; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pandn %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm4
-; SSE-NEXT: pand %xmm3, %xmm8
-; SSE-NEXT: por %xmm7, %xmm8
-; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: pand %xmm15, %xmm8
+; SSE-NEXT: por %xmm5, %xmm8
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pandn %xmm2, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm6
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: por %xmm2, %xmm5
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pandn %xmm11, %xmm3
-; SSE-NEXT: por %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0]
+; SSE-NEXT: pand %xmm15, %xmm2
+; SSE-NEXT: pandn %xmm11, %xmm15
+; SSE-NEXT: por %xmm2, %xmm15
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
-; SSE-NEXT: pandn %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, 32(%rsi)
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
+; SSE-NEXT: pandn %xmm6, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, 32(%rsi)
+; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 48(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 16(%rsi)
-; SSE-NEXT: movdqa %xmm0, 32(%rdx)
-; SSE-NEXT: movdqa %xmm14, (%rdx)
-; SSE-NEXT: movdqa %xmm9, 48(%rdx)
-; SSE-NEXT: movdqa %xmm10, 16(%rdx)
-; SSE-NEXT: movdqa %xmm1, 32(%rcx)
-; SSE-NEXT: movdqa %xmm6, (%rcx)
-; SSE-NEXT: movdqa %xmm4, 48(%rcx)
-; SSE-NEXT: movdqa %xmm5, 16(%rcx)
-; SSE-NEXT: addq $72, %rsp
+; SSE-NEXT: movdqa %xmm1, 32(%rdx)
+; SSE-NEXT: movdqa %xmm9, (%rdx)
+; SSE-NEXT: movdqa %xmm12, 48(%rdx)
+; SSE-NEXT: movdqa %xmm7, 16(%rdx)
+; SSE-NEXT: movdqa %xmm0, 32(%rcx)
+; SSE-NEXT: movdqa %xmm5, (%rcx)
+; SSE-NEXT: movdqa %xmm3, 48(%rcx)
+; SSE-NEXT: movdqa %xmm4, 16(%rcx)
+; SSE-NEXT: addq $56, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa 176(%rdi), %xmm9
-; AVX1-NEXT: vmovdqa 160(%rdi), %xmm11
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa 144(%rdi), %xmm13
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX1-NEXT: vmovdqa 96(%rdi), %xmm15
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa 128(%rdi), %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm14
-; AVX1-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm12
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,2,1]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm6[3,4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13>
-; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm12
-; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3,4],xmm12[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
-; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15>
-; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm15, %xmm7, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4],xmm0[5,6,7]
+; AVX1-NEXT: pushq %rax
+; AVX1-NEXT: vmovdqa 176(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa 160(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa 144(%rdi), %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7]
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm6
+; AVX1-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15>
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
-; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm4, (%rsi)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm4, 32(%rsi)
-; AVX1-NEXT: vmovdqa %xmm6, 48(%rdx)
-; AVX1-NEXT: vmovdqa %xmm5, 32(%rdx)
-; AVX1-NEXT: vmovdqa %xmm12, (%rdx)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vmovaps %xmm4, 16(%rdx)
-; AVX1-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX1-NEXT: vmovdqa %xmm3, 16(%rcx)
-; AVX1-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX1-NEXT: vmovdqa 128(%rdi), %xmm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,2,1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm0
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm12
+; AVX1-NEXT: vmovdqa (%rdi), %xmm11
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm13
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm15 = xmm8[0,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0],xmm13[1],xmm11[2,3],xmm13[4],xmm11[5,6],xmm13[7]
+; AVX1-NEXT: vpshufb %xmm10, %xmm15, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,2,1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm15[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm0
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX1-NEXT: vpshufb %xmm10, %xmm12, %xmm12
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm8, %xmm15
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm12[3,4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm15, %xmm15
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13>
+; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4],xmm12[5,6,7]
+; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm12, %xmm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm12[2],xmm0[3,4],xmm12[5],xmm0[6,7]
+; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15>
+; AVX1-NEXT: vpshufb %xmm6, %xmm9, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm7[5,6,7]
+; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3,4],xmm6[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm0[1],xmm12[2,3],xmm0[4],xmm12[5,6],xmm0[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15>
+; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5,6,7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rsi)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm15, 32(%rdx)
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vmovaps %xmm0, (%rdx)
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vmovaps %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm4, 16(%rcx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX1-NEXT: vmovdqa %xmm2, 48(%rcx)
+; AVX1-NEXT: popq %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm11
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm15
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13,14],ymm6[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm8
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6],ymm5[7],ymm2[8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14],ymm5[15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX2-NEXT: vpshufb %ymm8, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5
; AVX2-NEXT: vmovdqa 160(%rdi), %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm15, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7],ymm2[8],ymm9[9],ymm2[10,11],ymm9[12],ymm2[13,14],ymm9[15]
-; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm9
+; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
+; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9
; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7
-; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm7[2],xmm2[3,4],xmm7[5],xmm2[6,7]
-; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX2-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
-; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm3, %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
+; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm10
; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
; AVX2-NEXT: vpshufb %ymm12, %ymm10, %ymm10
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-NEXT: vpshufb %xmm14, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm14, %xmm13, %xmm13
+; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm15, %ymm11, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7,8,9],ymm13[10],ymm0[11,12],ymm13[13],ymm0[14,15]
-; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7]
-; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
-; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
+; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
+; AVX2-NEXT: vpshufb %ymm12, %ymm11, %ymm11
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
+; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
+; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
@@ -943,20 +949,20 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) noun
; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT: vpblendvb %ymm1, %ymm15, %ymm11, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7],ymm5[8],ymm1[9,10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15]
-; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6],xmm7[7]
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
+; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovdqa %ymm9, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm8, 32(%rsi)
+; AVX2-NEXT: vmovdqa %ymm2, 32(%rsi)
; AVX2-NEXT: vmovdqa %ymm10, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm11, (%rdx)
; AVX2-NEXT: vmovdqa %ymm3, 32(%rcx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index d06ab7caec3b6..453c623dba905 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -216,50 +216,50 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm10
+; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm3
-; SSE-NEXT: movdqa 32(%rdi), %xmm8
-; SSE-NEXT: movdqa 48(%rdi), %xmm9
+; SSE-NEXT: movdqa 32(%rdi), %xmm1
+; SSE-NEXT: movdqa 48(%rdi), %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[0,1,2,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,2,0,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
+; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE-NEXT: movapd %xmm0, (%rsi)
-; SSE-NEXT: movapd %xmm2, (%rdx)
-; SSE-NEXT: movapd %xmm6, (%rcx)
-; SSE-NEXT: movapd %xmm3, (%r8)
+; SSE-NEXT: movapd %xmm7, (%rdx)
+; SSE-NEXT: movapd %xmm8, (%rcx)
+; SSE-NEXT: movapd %xmm1, (%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf8:
@@ -275,7 +275,7 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX1-NEXT: vpackusdw %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm8
+; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
@@ -295,9 +295,9 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -305,9 +305,9 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm8, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX1-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm6, (%rcx)
; AVX1-NEXT: vmovdqa %xmm1, (%r8)
; AVX1-NEXT: retq
;
@@ -320,7 +320,7 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm8
+; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -344,9 +344,9 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -354,9 +354,9 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX2-SLOW-NEXT: vmovdqa %xmm8, (%rsi)
+; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%r8)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -370,7 +370,7 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm8
+; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -392,9 +392,9 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -402,9 +402,9 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX2-FAST-NEXT: vmovdqa %xmm8, (%rsi)
+; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX2-FAST-NEXT: vmovdqa %xmm6, (%rcx)
; AVX2-FAST-NEXT: vmovdqa %xmm1, (%r8)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
@@ -444,200 +444,198 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa 96(%rdi), %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 96(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 64(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 80(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdi), %xmm13
-; SSE-NEXT: movdqa 16(%rdi), %xmm14
-; SSE-NEXT: movdqa 32(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rdi), %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
+; SSE-NEXT: movdqa (%rdi), %xmm9
+; SSE-NEXT: movdqa 16(%rdi), %xmm10
+; SSE-NEXT: movdqa 32(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 48(%rdi), %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
-; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
+; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movdqa 112(%rdi), %xmm15
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3]
+; SSE-NEXT: movdqa 112(%rdi), %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,1,1,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: # xmm10 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,0,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
+; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; SSE-NEXT: # xmm12 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,1,2,0,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: # xmm13 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[2,0,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; SSE-NEXT: # xmm14 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,2,0,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm15[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
-; SSE-NEXT: movapd %xmm7, 16(%rsi)
-; SSE-NEXT: movapd %xmm11, (%rsi)
-; SSE-NEXT: movapd %xmm1, 16(%rdx)
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
+; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm13[0],xmm15[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
+; SSE-NEXT: movapd %xmm4, 16(%rsi)
+; SSE-NEXT: movapd %xmm7, (%rsi)
+; SSE-NEXT: movapd %xmm8, 16(%rdx)
; SSE-NEXT: movapd %xmm5, (%rdx)
-; SSE-NEXT: movapd %xmm6, 16(%rcx)
-; SSE-NEXT: movapd %xmm3, (%rcx)
-; SSE-NEXT: movapd %xmm2, 16(%r8)
-; SSE-NEXT: movapd %xmm0, (%r8)
+; SSE-NEXT: movapd %xmm15, 16(%rcx)
+; SSE-NEXT: movapd %xmm6, (%rcx)
+; SSE-NEXT: movapd %xmm3, 16(%r8)
+; SSE-NEXT: movapd %xmm1, (%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm2[1,2,3],xmm5[4],xmm2[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7]
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm2[1,2,3],xmm6[4],xmm2[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm1[1,2,3],xmm6[4],xmm1[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa 80(%rdi), %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm2[1,2,3],xmm7[4],xmm2[5,6,7]
-; AVX1-NEXT: vmovdqa 64(%rdi), %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa (%rdi), %xmm11
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm12
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm13
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm2[1,2,3],xmm13[4],xmm2[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm2[1,2,3],xmm12[4],xmm2[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1,2,3],xmm11[4],xmm2[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm9
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm10
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,2,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,2,0,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7]
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm9
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0],xmm1[1,2,3],xmm9[4],xmm1[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm10, %xmm11, %xmm10
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm11, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm10, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm14
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,2,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,2,0,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm13[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,3,1,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm9, (%rsi)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovaps %xmm1, 16(%rsi)
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
; AVX1-NEXT: vmovaps %ymm10, (%rdx)
; AVX1-NEXT: vmovaps %ymm11, (%rcx)
-; AVX1-NEXT: vmovaps %ymm0, (%r8)
+; AVX1-NEXT: vmovaps %ymm2, (%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -663,94 +661,92 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9
-; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5
; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi)
-; AVX2-SLOW-NEXT: vmovdqa %ymm13, (%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rdx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: vf16:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm11
-; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm2
; AVX2-FAST-ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm3, %xmm4
@@ -760,68 +756,66 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-ALL-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm3, %ymm4
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm4, %ymm6
-; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm3, %ymm7
+; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm3, %ymm4
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm4, %ymm6
+; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm3, %ymm7
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm7, %ymm3
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm13
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm3
; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm0
-; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm5, %xmm2
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm0, %xmm1
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm13, %xmm2
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm4, %ymm3
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm7, %ymm4
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm1, %ymm3
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm3, %ymm7
-; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm1, %ymm8
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,2,0,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rdx)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8)
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm9
+; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm10
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm11, %xmm10, %xmm12
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm11, %xmm9, %xmm11
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm6, %xmm13
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm4, %ymm4
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm13, %ymm7, %ymm7
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm7, %ymm2
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm2, %ymm5
+; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm7, %ymm1
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm1, %ymm7
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,2,0,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,0,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm13, %ymm1, %ymm1
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,1,3,1,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, (%rdx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%rcx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, (%r8)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
@@ -847,80 +841,78 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm9
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm12
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm8
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
@@ -961,13 +953,13 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $280, %rsp # imm = 0x118
-; SSE-NEXT: movdqa 224(%rdi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 192(%rdi), %xmm5
+; SSE-NEXT: subq $296, %rsp # imm = 0x128
+; SSE-NEXT: movdqa 224(%rdi), %xmm11
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 192(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 208(%rdi), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 208(%rdi), %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 112(%rdi), %xmm3
@@ -976,32 +968,31 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 80(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE-NEXT: movdqa 240(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1009,63 +1000,63 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm1[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 128(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 144(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
; SSE-NEXT: movdqa 160(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 176(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
-; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
+; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
-; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
+; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,1,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
@@ -1075,10 +1066,11 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshuflw $237, (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
+; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm6[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1095,293 +1087,293 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: # xmm3 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,1,2,0,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; SSE-NEXT: # xmm9 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[0,1,2,0,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT: # xmm8 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[3,1,2,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; SSE-NEXT: # xmm5 = mem[3,1,2,3]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,0,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; SSE-NEXT: # xmm14 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,2,0,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[0,1,2,0,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
-; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,2,0,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm14[0,1,2,0,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
+; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT: # xmm13 = mem[3,1,2,3]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; SSE-NEXT: # xmm12 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[2,0,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE-NEXT: # xmm11 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[2,0,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT: # xmm6 = mem[3,1,2,3]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = mem[3,1,2,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm5[0,1,2,0,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[0,1,2,0,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 32(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, (%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 48(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 16(%rsi)
-; SSE-NEXT: movapd %xmm7, (%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 32(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 48(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 16(%rdx)
+; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 32(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, (%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 48(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 16(%rsi)
+; SSE-NEXT: movapd %xmm12, (%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 32(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 48(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 16(%rdx)
; SSE-NEXT: movapd %xmm15, 32(%rcx)
-; SSE-NEXT: movapd %xmm10, (%rcx)
+; SSE-NEXT: movapd %xmm9, (%rcx)
; SSE-NEXT: movapd %xmm2, 16(%rcx)
-; SSE-NEXT: movapd %xmm11, 48(%rcx)
-; SSE-NEXT: movapd %xmm5, 32(%r8)
-; SSE-NEXT: movapd %xmm0, (%r8)
+; SSE-NEXT: movapd %xmm10, 48(%rcx)
+; SSE-NEXT: movapd %xmm4, 32(%r8)
+; SSE-NEXT: movapd %xmm7, (%r8)
; SSE-NEXT: movapd %xmm1, 48(%r8)
-; SSE-NEXT: movapd %xmm4, 16(%r8)
-; SSE-NEXT: addq $280, %rsp # imm = 0x118
+; SSE-NEXT: movapd %xmm3, 16(%r8)
+; SSE-NEXT: addq $296, %rsp # imm = 0x128
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $200, %rsp
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vmovdqa 240(%rdi), %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; AVX1-NEXT: vmovdqa %xmm1, %xmm12
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
+; AVX1-NEXT: vmovdqa %xmm1, %xmm9
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa 224(%rdi), %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; AVX1-NEXT: vmovdqa %xmm2, %xmm14
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
+; AVX1-NEXT: vmovdqa %xmm2, %xmm10
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa 208(%rdi), %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; AVX1-NEXT: vmovdqa %xmm2, %xmm15
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
+; AVX1-NEXT: vmovdqa %xmm2, %xmm11
; AVX1-NEXT: vmovdqa 192(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa 176(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
; AVX1-NEXT: vmovdqa 160(%rdi), %xmm1
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa 144(%rdi), %xmm1
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; AVX1-NEXT: vmovdqa 128(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
+; AVX1-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm6[1,2,3],xmm12[4],xmm6[5,6,7]
+; AVX1-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm8
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm4[1,2,3],xmm8[4],xmm4[5,6,7]
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7]
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm4[1,2,3],xmm5[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm9
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm4[1,2,3],xmm9[4],xmm4[5,6,7]
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7]
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm10
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm4[1,2,3],xmm10[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm7, %xmm13
-; AVX1-NEXT: vmovdqa (%rdi), %xmm11
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm15
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm6[1,2,3],xmm15[4],xmm6[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm7, %xmm7
+; AVX1-NEXT: vmovdqa (%rdi), %xmm14
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3],xmm11[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpackusdw %xmm13, %xmm4, %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm6[1,2,3],xmm14[4],xmm6[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpackusdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm14, %xmm12
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX1-NEXT: vmovdqa %xmm15, %xmm14
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm10, %xmm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; AVX1-NEXT: vmovdqa %xmm11, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; AVX1-NEXT: # xmm6 = mem[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm6 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX1-NEXT: # xmm7 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm7 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3]
; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,1,2,3]
; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,1,2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm10 = mem[3,1,2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm8 = mem[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[3,1,2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[3,1,2,3]
-; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm15 = mem[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm12 = mem[3,1,2,3]
+; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm13 = mem[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX1-NEXT: # xmm3 = mem[3,1,2,3]
-; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vpshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
; AVX1-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -1393,8 +1385,8 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX1-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,3,1,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
@@ -1402,15 +1394,15 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,3,1,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,3,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
@@ -1430,7 +1422,7 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX1-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm2, (%rdx)
-; AVX1-NEXT: vmovaps %ymm13, 32(%rcx)
+; AVX1-NEXT: vmovaps %ymm14, 32(%rcx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm2, (%rcx)
; AVX1-NEXT: vmovaps %ymm1, 32(%r8)
@@ -1441,7 +1433,7 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
;
; AVX2-SLOW-LABEL: vf32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: subq $200, %rsp
+; AVX2-SLOW-NEXT: subq $184, %rsp
; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -1484,33 +1476,34 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12
-; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm15
-; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm11
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7
+; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm10
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
@@ -1518,20 +1511,18 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0
-; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm3
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm13
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
@@ -1548,58 +1539,57 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[1,3,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3]
; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3]
; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload
+; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm10 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -1633,19 +1623,19 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[3,1,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -1671,19 +1661,19 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
-; AVX2-SLOW-NEXT: addq $200, %rsp
+; AVX2-SLOW-NEXT: addq $184, %rsp
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: vf32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: subq $200, %rsp
-; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX2-FAST-ALL-NEXT: subq $136, %rsp
+; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm5
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm6
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm7
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm5
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm4
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -1694,155 +1684,149 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-ALL-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX2-FAST-ALL-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm3, %ymm10
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm10, %ymm4
-; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm3, %ymm8
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm9, %ymm8, %ymm5
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, %ymm11
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm2, %ymm7
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm7, %ymm3
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm8
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm2, %ymm12
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm12, %ymm4
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-FAST-ALL-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-FAST-ALL-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-FAST-ALL-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-FAST-ALL-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm3, %ymm15
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm15, %ymm1
-; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm3, %ymm9
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm9, %ymm4
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm2, %ymm15
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm15, %ymm1
+; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm2, %ymm4
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm4, %ymm2
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm2, %xmm1
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm12, %xmm4
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm9
+; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm3, %xmm1
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm9, %xmm2
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm11
-; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm3, %xmm4
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm11, %xmm7
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm10, %ymm5
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm13, %ymm8, %ymm7
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm5, %xmm6
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm11, %xmm10
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm7, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm10
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm12, %ymm7
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %xmm0
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 176(%rdi), %xmm4
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm4, %xmm7
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 144(%rdi), %xmm10
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm10, %xmm0
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm5, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %xmm13
+; AVX2-FAST-ALL-NEXT: vmovdqa 176(%rdi), %xmm6
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm6, %xmm2
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm13, %xmm12
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
+; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX2-FAST-ALL-NEXT: vmovdqa 144(%rdi), %xmm14
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm14, %xmm0
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm12, %xmm1
; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm15, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm13, %ymm9, %ymm8
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm15, %ymm1
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpermd (%rsp), %ymm1, %ymm15 # 32-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,2,0,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,2,0,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
+; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[2,0,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,2,0,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,1,2,0,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm9, %ymm5
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # xmm2 = mem[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # xmm7 = mem[3,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,0,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,0,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm7, %ymm11
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[0,1,2,0,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm0, %ymm5
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,3,1,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,3,1,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,1,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm9, %ymm3
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm8, %ymm8
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm7, %ymm3
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,1,4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rsi)
-; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rsi)
-; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rdx)
-; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdx)
-; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rcx)
-; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rcx)
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, 32(%rsi)
+; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, (%rsi)
+; AVX2-FAST-ALL-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, 32(%rdx)
+; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, (%rdx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, (%rcx)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 32(%r8)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8)
-; AVX2-FAST-ALL-NEXT: addq $200, %rsp
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, (%r8)
+; AVX2-FAST-ALL-NEXT: addq $136, %rsp
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vf32:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: subq $184, %rsp
+; AVX2-FAST-PERLANE-NEXT: subq $168, %rsp
; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -1884,161 +1868,159 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm15
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm11
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm15
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm11
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm15
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
@@ -2051,12 +2033,12 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8)
-; AVX2-FAST-PERLANE-NEXT: addq $184, %rsp
+; AVX2-FAST-PERLANE-NEXT: addq $168, %rsp
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index f0a48c0496387..1d6755351a10f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -316,104 +316,102 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa 64(%rdi), %xmm11
-; SSE-NEXT: movdqa (%rdi), %xmm14
-; SSE-NEXT: movdqa 16(%rdi), %xmm15
-; SSE-NEXT: movdqa 32(%rdi), %xmm8
-; SSE-NEXT: movdqa 48(%rdi), %xmm12
+; SSE-NEXT: movdqa 64(%rdi), %xmm5
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa 32(%rdi), %xmm0
+; SSE-NEXT: movdqa 48(%rdi), %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,3]
; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pandn %xmm8, %xmm3
+; SSE-NEXT: pandn %xmm0, %xmm3
; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3]
-; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: andps %xmm10, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,1,0,1]
-; SSE-NEXT: movaps %xmm10, %xmm9
-; SSE-NEXT: pandn %xmm4, %xmm9
-; SSE-NEXT: por %xmm7, %xmm9
-; SSE-NEXT: movdqa %xmm15, %xmm4
-; SSE-NEXT: psrlq $48, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[1,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
-; SSE-NEXT: pand %xmm7, %xmm4
-; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: por %xmm4, %xmm7
-; SSE-NEXT: pand %xmm10, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,2,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,1,1,3]
-; SSE-NEXT: psllq $48, %xmm11
-; SSE-NEXT: pandn %xmm11, %xmm10
-; SSE-NEXT: por %xmm7, %xmm10
-; SSE-NEXT: movdqa %xmm12, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm8[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm8[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0,1,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,1,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm15[2],xmm4[3],xmm15[3]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm15[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[0,2]
+; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
+; SSE-NEXT: andps %xmm3, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,0,1]
+; SSE-NEXT: movaps %xmm3, %xmm4
+; SSE-NEXT: pandn %xmm8, %xmm4
+; SSE-NEXT: por %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm2, %xmm7
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pandn %xmm12, %xmm2
-; SSE-NEXT: por %xmm2, %xmm4
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0]
+; SSE-NEXT: psrlq $48, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,4,7]
+; SSE-NEXT: pand %xmm7, %xmm9
; SSE-NEXT: pandn %xmm8, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7]
-; SSE-NEXT: pshufhw $148, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,6]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSE-NEXT: por %xmm9, %xmm7
+; SSE-NEXT: pand %xmm3, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,1,0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,1,1,3]
+; SSE-NEXT: psllq $48, %xmm5
+; SSE-NEXT: pandn %xmm5, %xmm3
; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,4,7]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSE-NEXT: movdqa %xmm9, (%rsi)
-; SSE-NEXT: movdqa %xmm10, (%rdx)
-; SSE-NEXT: movaps %xmm4, (%rcx)
-; SSE-NEXT: movaps %xmm1, (%r8)
-; SSE-NEXT: movaps %xmm3, (%r9)
+; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0,1,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,1,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3]
+; SSE-NEXT: pand %xmm11, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; SSE-NEXT: movdqa %xmm11, %xmm13
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,0,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm11, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm11, %xmm2
+; SSE-NEXT: pandn %xmm6, %xmm11
+; SSE-NEXT: por %xmm11, %xmm12
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm6[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pandn %xmm6, %xmm13
+; SSE-NEXT: por %xmm13, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[3,0]
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,5,5,6]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,0]
+; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5,4,7]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE-NEXT: movdqa %xmm4, (%rsi)
+; SSE-NEXT: movdqa %xmm3, (%rdx)
+; SSE-NEXT: movaps %xmm12, (%rcx)
+; SSE-NEXT: movaps %xmm14, (%r8)
+; SSE-NEXT: movaps %xmm2, (%r9)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf8:
@@ -432,7 +430,7 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,5,6],xmm6[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm6[7]
; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
@@ -441,25 +439,25 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,10,11,4,5,14,15,u,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4,5,6,7]
; AVX1-NEXT: vpsllq $48, %xmm5, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,5,6],xmm7[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,1,3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3,4,5],xmm7[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,2,0]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm7[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5],xmm7[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,2,0]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5],xmm8[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
@@ -470,10 +468,10 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,1,3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-NEXT: vmovdqa %xmm8, (%rsi)
-; AVX1-NEXT: vmovdqa %xmm9, (%rdx)
-; AVX1-NEXT: vmovdqa %xmm4, (%rcx)
-; AVX1-NEXT: vmovdqa %xmm6, (%r8)
+; AVX1-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm7, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm8, (%r8)
; AVX1-NEXT: vmovdqa %xmm0, (%r9)
; AVX1-NEXT: retq
;
@@ -655,348 +653,354 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: vf16:
; SSE: # %bb.0:
-; SSE-NEXT: subq $24, %rsp
-; SSE-NEXT: movdqa 144(%rdi), %xmm12
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: movdqa 144(%rdi), %xmm13
; SSE-NEXT: movdqa 80(%rdi), %xmm5
-; SSE-NEXT: movdqa 96(%rdi), %xmm10
-; SSE-NEXT: movdqa 128(%rdi), %xmm8
-; SSE-NEXT: movdqa 112(%rdi), %xmm14
-; SSE-NEXT: movdqa 64(%rdi), %xmm7
-; SSE-NEXT: movdqa (%rdi), %xmm6
-; SSE-NEXT: movdqa 16(%rdi), %xmm9
-; SSE-NEXT: movdqa 32(%rdi), %xmm4
-; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movdqa 96(%rdi), %xmm3
+; SSE-NEXT: movdqa 128(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 112(%rdi), %xmm9
+; SSE-NEXT: movdqa 64(%rdi), %xmm12
+; SSE-NEXT: movdqa (%rdi), %xmm7
+; SSE-NEXT: movdqa 16(%rdi), %xmm8
+; SSE-NEXT: movdqa 32(%rdi), %xmm11
+; SSE-NEXT: movdqa 48(%rdi), %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
-; SSE-NEXT: movdqa %xmm3, %xmm11
+; SSE-NEXT: pandn %xmm11, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3]
+; SSE-NEXT: movdqa %xmm4, %xmm14
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
-; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: andps %xmm13, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
-; SSE-NEXT: movaps %xmm13, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
+; SSE-NEXT: movdqa %xmm7, %xmm10
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
+; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0]
+; SSE-NEXT: andps %xmm7, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1]
+; SSE-NEXT: movdqa %xmm12, %xmm15
+; SSE-NEXT: movaps %xmm7, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm14, %xmm0
+; SSE-NEXT: pandn %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm9, %xmm6
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
+; SSE-NEXT: movdqa %xmm3, %xmm9
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; SSE-NEXT: andps %xmm13, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,1]
-; SSE-NEXT: movdqa %xmm12, %xmm15
-; SSE-NEXT: movaps %xmm13, %xmm1
+; SSE-NEXT: andps %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm13, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,1]
+; SSE-NEXT: movaps %xmm7, %xmm1
; SSE-NEXT: andnps %xmm0, %xmm1
; SSE-NEXT: orps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm8, %xmm0
; SSE-NEXT: psrlq $48, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3]
+; SSE-NEXT: movdqa %xmm14, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm2
-; SSE-NEXT: movdqa %xmm7, %xmm12
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm15, %xmm14
+; SSE-NEXT: movdqa %xmm15, %xmm2
; SSE-NEXT: psllq $48, %xmm2
-; SSE-NEXT: movaps %xmm13, %xmm3
-; SSE-NEXT: andnps %xmm2, %xmm3
-; SSE-NEXT: pand %xmm13, %xmm1
-; SSE-NEXT: orps %xmm1, %xmm3
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: movaps %xmm7, %xmm13
+; SSE-NEXT: andnps %xmm2, %xmm13
+; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: orps %xmm1, %xmm13
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm9, %xmm10
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: psrlq $48, %xmm1
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movdqa %xmm8, %xmm5
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pandn %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pand %xmm13, %xmm0
-; SSE-NEXT: movdqa %xmm15, %xmm8
-; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: movdqa %xmm12, %xmm4
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm12, %xmm1
; SSE-NEXT: psllq $48, %xmm1
-; SSE-NEXT: pandn %xmm1, %xmm13
-; SSE-NEXT: por %xmm0, %xmm13
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3]
+; SSE-NEXT: pandn %xmm1, %xmm7
+; SSE-NEXT: por %xmm0, %xmm7
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm9
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: andnps %xmm0, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,4,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; SSE-NEXT: pand %xmm1, %xmm15
-; SSE-NEXT: por %xmm2, %xmm15
+; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: andnps %xmm0, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3]
+; SSE-NEXT: pand %xmm3, %xmm15
+; SSE-NEXT: por %xmm1, %xmm15
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm2[2,0]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm14[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm14[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,1,3]
-; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: andnps %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,0]
+; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: andnps %xmm1, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por %xmm3, %xmm0
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1]
-; SSE-NEXT: movdqa %xmm7, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[3,0]
-; SSE-NEXT: movdqa %xmm1, %xmm8
-; SSE-NEXT: pandn %xmm4, %xmm8
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm11[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm12, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm11[3,0]
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm11, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,4,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[0,1,0,3]
+; SSE-NEXT: movdqa %xmm14, %xmm12
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm4[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[2,2,2,2,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm14
+; SSE-NEXT: pandn %xmm4, %xmm14
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: por %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm10[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[0,2]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: movaps %xmm11, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm14[3,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm7
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm6
-; SSE-NEXT: pandn %xmm14, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,4,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[1,0,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm3, %xmm11
+; SSE-NEXT: por %xmm14, %xmm11
+; SSE-NEXT: movdqa %xmm10, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[0,2]
+; SSE-NEXT: movdqa %xmm5, %xmm4
+; SSE-NEXT: movdqa %xmm9, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm9[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: movaps %xmm10, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[3,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm3, %xmm8
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm3, %xmm5
+; SSE-NEXT: pandn %xmm6, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm5[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
-; SSE-NEXT: por %xmm8, %xmm7
-; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0]
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 16(%rsi)
-; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, (%rsi)
-; SSE-NEXT: movdqa %xmm13, 16(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,0]
+; SSE-NEXT: por %xmm2, %xmm8
+; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0]
+; SSE-NEXT: por %xmm5, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, (%rsi)
+; SSE-NEXT: movdqa %xmm7, 16(%rdx)
+; SSE-NEXT: movaps %xmm13, (%rdx)
; SSE-NEXT: movaps %xmm0, 16(%rcx)
; SSE-NEXT: movaps %xmm15, (%rcx)
-; SSE-NEXT: movaps %xmm4, 16(%r8)
-; SSE-NEXT: movaps %xmm2, (%r8)
-; SSE-NEXT: movaps %xmm1, 16(%r9)
-; SSE-NEXT: movaps %xmm7, (%r9)
-; SSE-NEXT: addq $24, %rsp
+; SSE-NEXT: movaps %xmm11, 16(%r8)
+; SSE-NEXT: movaps %xmm1, (%r8)
+; SSE-NEXT: movaps %xmm3, 16(%r9)
+; SSE-NEXT: movaps %xmm8, (%r9)
+; SSE-NEXT: popq %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa 96(%rdi), %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,1,1,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm9
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1]
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm13
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; AVX1-NEXT: vmovdqa 144(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,7]
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm1[1]
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7]
+; AVX1-NEXT: vmovdqa 144(%rdi), %xmm8
; AVX1-NEXT: vmovdqa 128(%rdi), %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm0[2,3],xmm7[4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm3[5,6,7]
-; AVX1-NEXT: vmovdqa (%rdi), %xmm14
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm15
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm3
-; AVX1-NEXT: vmovaps 64(%rdi), %xmm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm11 = xmm1[0,1,0,1]
-; AVX1-NEXT: vandnps %ymm11, %ymm12, %ymm11
-; AVX1-NEXT: vorps %ymm3, %ymm11, %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm0[4,5],xmm7[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2,3,4],xmm3[5,6,7]
-; AVX1-NEXT: vpsllq $48, %xmm1, %xmm3
-; AVX1-NEXT: vandnps %ymm3, %ymm12, %ymm3
-; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7]
-; AVX1-NEXT: vandps %ymm2, %ymm12, %ymm2
-; AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm11
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm8[4,5],xmm9[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,1,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,0]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm12
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u]
-; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm14[2,3],xmm15[4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3],xmm14[4,5],xmm15[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm5[4],xmm9[5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm11, %ymm9, %ymm12
+; AVX1-NEXT: vmovaps 64(%rdi), %xmm9
+; AVX1-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[0,1,0,1]
+; AVX1-NEXT: vandnps %ymm13, %ymm11, %ymm13
+; AVX1-NEXT: vorps %ymm13, %ymm12, %ymm12
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
+; AVX1-NEXT: vpsllq $48, %xmm9, %xmm13
+; AVX1-NEXT: vandnps %ymm13, %ymm11, %ymm13
+; AVX1-NEXT: vpsrlq $48, %xmm4, %xmm14
+; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX1-NEXT: vandps %ymm11, %ymm14, %ymm11
+; AVX1-NEXT: vorps %ymm13, %ymm11, %ymm11
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,0]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u]
+; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm14
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovaps %ymm10, (%rsi)
; AVX1-NEXT: vmovaps %ymm11, (%rdx)
; AVX1-NEXT: vmovaps %ymm12, (%rcx)
-; AVX1-NEXT: vmovaps %ymm2, (%r8)
+; AVX1-NEXT: vmovaps %ymm13, (%r8)
; AVX1-NEXT: vmovaps %ymm0, (%r9)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1015,57 +1019,57 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm5
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6
; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6],xmm5[7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5],ymm7[6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm6[2],xmm4[3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm4[2],xmm6[3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -1082,10 +1086,10 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rsi)
-; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rdx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rsi)
+; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rdx)
; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r8)
+; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -1109,55 +1113,55 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,0,3,5,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm6, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm6, %xmm5
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6],xmm5[7]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm6, %xmm9
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm10, %ymm6
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,3,6,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6
+; AVX2-FAST-ALL-NEXT: vpermd %ymm9, %ymm10, %ymm9
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,1,3,6,u>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm8, %ymm9
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm6, %ymm6
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,2,u,u,5,7,2,4>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm10, %ymm7
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,4,6,0,1,4,6,0]
-; AVX2-FAST-ALL-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm7, %ymm7
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm7, %ymm7
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm10, %ymm6
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0]
-; AVX2-FAST-ALL-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm6, %ymm6
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm9, %ymm9
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm11, %ymm10
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0]
+; AVX2-FAST-ALL-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm10, %ymm10
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm10, %ymm7
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm11, %ymm10
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0]
+; AVX2-FAST-ALL-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm10, %ymm10
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm10, %ymm8
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
@@ -1171,10 +1175,10 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rsi)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm12, (%rdx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%rsi)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, (%rdx)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm7, (%rcx)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%r8)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm8, (%r8)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
@@ -1193,57 +1197,57 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6],xmm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5],ymm7[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -1258,10 +1262,10 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rsi)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%rsi)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
@@ -1318,17 +1322,18 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $408, %rsp # imm = 0x198
-; SSE-NEXT: movdqa 304(%rdi), %xmm9
+; SSE-NEXT: subq $424, %rsp # imm = 0x1A8
+; SSE-NEXT: movdqa 304(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 240(%rdi), %xmm8
; SSE-NEXT: movdqa 256(%rdi), %xmm12
-; SSE-NEXT: movdqa 288(%rdi), %xmm14
-; SSE-NEXT: movdqa 272(%rdi), %xmm5
-; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rdi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 80(%rdi), %xmm13
-; SSE-NEXT: movdqa 96(%rdi), %xmm10
+; SSE-NEXT: movdqa 288(%rdi), %xmm6
+; SSE-NEXT: movdqa 272(%rdi), %xmm11
+; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa 144(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 80(%rdi), %xmm5
+; SSE-NEXT: movdqa 96(%rdi), %xmm9
; SSE-NEXT: movdqa 128(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 112(%rdi), %xmm2
@@ -1339,28 +1344,29 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3]
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
-; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: andps %xmm7, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1]
-; SSE-NEXT: movaps %xmm7, %xmm2
+; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
+; SSE-NEXT: andps %xmm10, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
+; SSE-NEXT: movaps %xmm10, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3]
+; SSE-NEXT: pandn %xmm11, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,3]
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
-; SSE-NEXT: movdqa %xmm12, %xmm5
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3]
@@ -1368,12 +1374,12 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
-; SSE-NEXT: movaps %xmm7, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
+; SSE-NEXT: movaps %xmm10, %xmm2
; SSE-NEXT: andnps %xmm1, %xmm2
; SSE-NEXT: movdqa 32(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: andps %xmm7, %xmm3
+; SSE-NEXT: andps %xmm10, %xmm3
; SSE-NEXT: orps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1383,35 +1389,37 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa 16(%rdi), %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rdi), %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa (%rdi), %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
; SSE-NEXT: movdqa 64(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE-NEXT: movaps %xmm7, %xmm2
+; SSE-NEXT: movaps %xmm10, %xmm2
; SSE-NEXT: andnps %xmm1, %xmm2
-; SSE-NEXT: andps %xmm7, %xmm3
+; SSE-NEXT: andps %xmm10, %xmm3
; SSE-NEXT: orps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 208(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE-NEXT: movdqa 208(%rdi), %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
+; SSE-NEXT: movdqa %xmm2, %xmm14
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movdqa 192(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pandn %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa 176(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-NEXT: movdqa 176(%rdi), %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE-NEXT: movdqa %xmm2, %xmm13
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: movdqa 160(%rdi), %xmm15
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3]
@@ -1422,22 +1430,22 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: movdqa 224(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: movaps %xmm7, %xmm1
+; SSE-NEXT: movaps %xmm10, %xmm1
; SSE-NEXT: andnps %xmm0, %xmm1
-; SSE-NEXT: andps %xmm7, %xmm2
+; SSE-NEXT: andps %xmm10, %xmm2
; SSE-NEXT: orps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: psrlq $48, %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,3,2,3]
+; SSE-NEXT: psrlq $48, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3]
+; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = mem[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
@@ -1445,24 +1453,22 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm11, %xmm2
; SSE-NEXT: psllq $48, %xmm2
-; SSE-NEXT: movaps %xmm7, %xmm3
+; SSE-NEXT: movaps %xmm10, %xmm3
; SSE-NEXT: andnps %xmm2, %xmm3
-; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: pand %xmm10, %xmm1
; SSE-NEXT: orps %xmm1, %xmm3
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: movdqa %xmm12, %xmm1
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3]
-; SSE-NEXT: movdqa %xmm14, %xmm13
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3]
; SSE-NEXT: pshufd $232, (%rsp), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
@@ -1472,22 +1478,23 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: psllq $48, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm3
+; SSE-NEXT: movdqa %xmm10, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: pand %xmm10, %xmm2
; SSE-NEXT: por %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: psrlq $48, %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3]
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: psrlq $48, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3]
; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
@@ -1497,24 +1504,24 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: psllq $48, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm3
+; SSE-NEXT: movdqa %xmm10, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: pand %xmm10, %xmm2
; SSE-NEXT: por %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: movdqa %xmm13, %xmm1
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3]
-; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[0,2,2,3]
+; SSE-NEXT: movdqa %xmm14, %xmm4
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
@@ -1523,111 +1530,112 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pandn %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: pand %xmm10, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT: movdqa %xmm6, %xmm1
; SSE-NEXT: psllq $48, %xmm1
-; SSE-NEXT: pandn %xmm1, %xmm7
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm10, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3]
+; SSE-NEXT: pandn %xmm1, %xmm10
+; SSE-NEXT: por %xmm0, %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm9, %xmm14
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
-; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: movaps %xmm10, %xmm1
+; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: movaps %xmm9, %xmm1
; SSE-NEXT: andnps %xmm0, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
+; SSE-NEXT: pand %xmm9, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,2,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0,1,3]
-; SSE-NEXT: movaps %xmm10, %xmm1
-; SSE-NEXT: andnps %xmm13, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
+; SSE-NEXT: movaps %xmm9, %xmm1
+; SSE-NEXT: andnps %xmm0, %xmm1
+; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
-; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3]
+; SSE-NEXT: pand %xmm9, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3]
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
-; SSE-NEXT: movaps %xmm10, %xmm1
+; SSE-NEXT: movaps %xmm9, %xmm1
; SSE-NEXT: andnps %xmm0, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,1,3]
+; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
-; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE-NEXT: pand %xmm9, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0,1,3]
-; SSE-NEXT: movaps %xmm10, %xmm1
-; SSE-NEXT: andnps %xmm5, %xmm1
-; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[0,1,1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm12[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,1,3]
+; SSE-NEXT: movaps %xmm9, %xmm1
+; SSE-NEXT: andnps %xmm4, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
+; SSE-NEXT: pand %xmm9, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7]
+; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
+; SSE-NEXT: movdqa %xmm10, %xmm6
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: pand %xmm9, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[3,0]
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: andnps %xmm7, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[3,0]
+; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: andnps %xmm13, %xmm0
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,7,4,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm14[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,4,6,7]
; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
@@ -1635,22 +1643,22 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm12, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,0,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm8
+; SSE-NEXT: pand %xmm9, %xmm8
; SSE-NEXT: por %xmm1, %xmm8
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0]
-; SSE-NEXT: movdqa %xmm10, %xmm11
-; SSE-NEXT: pandn %xmm15, %xmm11
+; SSE-NEXT: movaps %xmm9, %xmm1
+; SSE-NEXT: andnps %xmm15, %xmm1
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,7,4,6,7]
; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
@@ -1658,120 +1666,121 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm10, %xmm3
-; SSE-NEXT: pandn %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7]
+; SSE-NEXT: movdqa %xmm9, %xmm15
+; SSE-NEXT: pandn %xmm0, %xmm15
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
+; SSE-NEXT: movdqa %xmm5, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[1,0,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm15
-; SSE-NEXT: por %xmm3, %xmm15
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2]
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
-; SSE-NEXT: movaps %xmm3, %xmm5
-; SSE-NEXT: movdqa %xmm12, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm12[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[0,2]
-; SSE-NEXT: movaps %xmm3, %xmm6
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: movaps %xmm9, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm13[3,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,0,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: por %xmm15, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2]
+; SSE-NEXT: movaps %xmm6, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm11, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2]
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2]
+; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: movaps %xmm10, %xmm1
+; SSE-NEXT: movdqa %xmm14, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm3
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[1,0,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm9, %xmm15
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm14
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pand %xmm9, %xmm14
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm0
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm12
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm5
-; SSE-NEXT: movdqa %xmm10, %xmm6
-; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: pandn %xmm13, %xmm10
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,7,4,6,7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm7[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[2,0]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: movaps %xmm7, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
-; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,3]
+; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm9, %xmm12
+; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm9, %xmm13
+; SSE-NEXT: movdqa %xmm9, %xmm6
+; SSE-NEXT: movdqa %xmm9, %xmm2
+; SSE-NEXT: pandn %xmm11, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm1[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pandn %xmm1, %xmm6
+; SSE-NEXT: por %xmm6, %xmm15
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: movaps %xmm11, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movaps %xmm3, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,7,4,6,7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm3[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[2,0]
; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[0,2,2,3]
+; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = mem[0,2,2,3]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT: # xmm6 = mem[0,1,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm3[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0]
-; SSE-NEXT: por %xmm11, %xmm0
-; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[0,2,2,3]
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = mem[0,2,2,3]
; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[0,1,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSE-NEXT: por %xmm1, %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
+; SSE-NEXT: por %xmm2, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,0]
-; SSE-NEXT: por %xmm5, %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3]
+; SSE-NEXT: por %xmm13, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -1796,26 +1805,26 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: movaps %xmm1, 48(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rcx)
-; SSE-NEXT: movaps %xmm3, 32(%r8)
-; SSE-NEXT: movaps %xmm15, (%r8)
+; SSE-NEXT: movaps %xmm15, 32(%r8)
+; SSE-NEXT: movaps %xmm5, (%r8)
; SSE-NEXT: movaps %xmm8, 48(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%r8)
-; SSE-NEXT: movaps %xmm10, (%r9)
+; SSE-NEXT: movaps %xmm9, (%r9)
; SSE-NEXT: movaps %xmm12, 32(%r9)
; SSE-NEXT: movaps %xmm0, 48(%r9)
; SSE-NEXT: movaps %xmm14, 16(%r9)
-; SSE-NEXT: addq $408, %rsp # imm = 0x198
+; SSE-NEXT: addq $424, %rsp # imm = 0x1A8
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $424, %rsp # imm = 0x1A8
-; AVX1-NEXT: vmovdqa 304(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa 288(%rdi), %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm2, %xmm13
+; AVX1-NEXT: vmovdqa 304(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa 288(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa 256(%rdi), %xmm2
@@ -1847,263 +1856,265 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm3, %ymm12, %ymm3
+; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX1-NEXT: vmovaps 224(%rdi), %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1]
-; AVX1-NEXT: vandnps %ymm4, %ymm12, %ymm4
+; AVX1-NEXT: vandnps %ymm4, %ymm5, %ymm4
+; AVX1-NEXT: vmovaps %ymm5, %ymm9
; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa 144(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa 128(%rdi), %xmm9
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3],xmm9[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa 144(%rdi), %xmm12
+; AVX1-NEXT: vmovdqa 128(%rdi), %xmm15
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm12[2,3],xmm15[4,5,6,7]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; AVX1-NEXT: vmovdqa %xmm0, %xmm14
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm8
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
+; AVX1-NEXT: vmovdqa 96(%rdi), %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,1,3]
; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
-; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm5
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1]
+; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
+; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1,2,3,4],xmm1[5,6,7]
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
-; AVX1-NEXT: vmovdqa %xmm2, %xmm10
+; AVX1-NEXT: vmovdqa %xmm2, %xmm7
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7]
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; AVX1-NEXT: vmovdqa %xmm1, %xmm5
+; AVX1-NEXT: vmovdqa %xmm1, %xmm13
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,1,0,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm2[4],xmm15[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
-; AVX1-NEXT: vandps %ymm4, %ymm12, %ymm7
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,1,0,3]
+; AVX1-NEXT: vmovdqa %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm2[4],xmm11[5,6,7]
+; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4,5,6,7]
+; AVX1-NEXT: vmovaps %ymm9, %ymm0
+; AVX1-NEXT: vandps %ymm4, %ymm9, %ymm11
; AVX1-NEXT: vmovaps 64(%rdi), %xmm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[0,1,0,1]
-; AVX1-NEXT: vmovaps %xmm1, %xmm4
-; AVX1-NEXT: vandnps %ymm15, %ymm12, %ymm15
-; AVX1-NEXT: vorps %ymm7, %ymm15, %ymm7
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = xmm1[0,1,0,1]
+; AVX1-NEXT: vmovaps %xmm1, %xmm14
+; AVX1-NEXT: vandnps %ymm9, %ymm0, %ymm9
+; AVX1-NEXT: vorps %ymm9, %ymm11, %ymm9
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa %xmm14, %xmm15
-; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm14[2,3],xmm8[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3,4,5,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm3[4,5],xmm9[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm14
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm7[0],xmm14[0],xmm7[1],xmm14[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
-; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3,4,5,6,7]
-; AVX1-NEXT: vandps %ymm7, %ymm12, %ymm7
-; AVX1-NEXT: vpsllq $48, %xmm4, %xmm14
-; AVX1-NEXT: vmovdqa %xmm4, %xmm8
-; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vandnps %ymm14, %ymm12, %ymm14
-; AVX1-NEXT: vorps %ymm7, %ymm14, %ymm7
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm12, %xmm11
+; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm15, %xmm4
+; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm12[4,5],xmm15[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm9
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm9[5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpsrlq $48, %xmm7, %xmm8
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm9
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5,6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm3, %ymm8, %ymm8
+; AVX1-NEXT: vpsllq $48, %xmm14, %xmm9
+; AVX1-NEXT: vmovdqa %xmm14, %xmm2
+; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vandnps %ymm9, %ymm3, %ymm9
+; AVX1-NEXT: vorps %ymm9, %ymm8, %ymm8
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa %xmm13, %xmm14
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm13[4,5],xmm5[6,7]
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm14[4,5],xmm7[6,7]
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm8 = mem[0,1],xmm15[2,3],mem[4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1],xmm12[2,3],xmm13[4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,0,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm4 = xmm1[0,1],mem[2,3],xmm1[4,5],mem[6,7]
-; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1],xmm13[2,3],xmm12[4,5],xmm13[6,7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5,6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsllq $48, %xmm4, %xmm4
-; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4
-; AVX1-NEXT: vorps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm9
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4,5,6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm1, %ymm9, %ymm1
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT: vpsllq $48, %xmm5, %xmm8
+; AVX1-NEXT: vandnps %ymm8, %ymm9, %ymm5
+; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm0 = mem[0,1,2,3],xmm15[4,5],mem[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13>
-; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm4 = mem[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw $12, (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm4 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm6[2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm1 = mem[0,1,1,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; AVX1-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,2,0]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm5 = mem[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm4[2,3],xmm11[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm5 = xmm4[0,1],mem[2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm5
+; AVX1-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm11 = mem[0,1,1,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
+; AVX1-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3,4,5],xmm11[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,1,2,0]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa %xmm5, %xmm15
-; AVX1-NEXT: vmovdqa %xmm14, %xmm8
-; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm5[2,3],xmm14[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm14
-; AVX1-NEXT: vmovdqa %xmm12, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm12[4,5],xmm13[6,7]
-; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa %xmm11, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4],xmm14[5,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm11[2,3],xmm7[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,1,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,2,0]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1
+; AVX1-NEXT: vmovdqa %xmm7, %xmm11
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm7[2,3],xmm14[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm15[4,5],xmm0[6,7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm2[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,1,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
+; AVX1-NEXT: vmovdqa %xmm10, %xmm9
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4,5],xmm5[6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,1,2,0]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm13[2,3],xmm5[4,5],xmm13[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15>
-; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm0
-; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm15[4,5],xmm8[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm10, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm11[4,5],xmm7[6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm0[2,3],xmm15[4,5],xmm0[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15>
+; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm0
+; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm5
+; AVX1-NEXT: vmovdqa %xmm3, %xmm15
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm11[4,5],xmm14[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm6[2,3],xmm9[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,4,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4,5],xmm5[6,7]
+; AVX1-NEXT: vmovdqa %xmm10, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7]
-; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm12, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm0[2,3],xmm8[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm13[4,5],xmm11[6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm9
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm10[4,5],xmm12[6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,4,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm13[4,5],xmm14[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4,5],xmm4[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
+; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,1,3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vpshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
+; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm4 = mem[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; AVX1-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX1-NEXT: # xmm4 = mem[0,3,2,3]
; AVX1-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX1-NEXT: # xmm4 = xmm4[0,1,2],mem[3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm5 = mem[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,3,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm4 = xmm6[0,1,2,3],mem[4,5],xmm6[6,7]
+; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm4 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX1-NEXT: # xmm4 = mem[1,1,1,1]
@@ -2112,7 +2123,7 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,1,1,3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
@@ -2120,17 +2131,17 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX1-NEXT: vmovaps %ymm3, (%rsi)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm3, 32(%rsi)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm3, 32(%rdx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm3, (%rdx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm0, 32(%rcx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm0, (%rcx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
-; AVX1-NEXT: vmovaps %ymm9, (%r8)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm3, 32(%rcx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm3, (%rcx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm3, 32(%r8)
+; AVX1-NEXT: vmovaps %ymm0, (%r8)
; AVX1-NEXT: vmovaps %ymm2, 32(%r9)
; AVX1-NEXT: vmovaps %ymm1, (%r9)
; AVX1-NEXT: addq $424, %rsp # imm = 0x1A8
@@ -2140,408 +2151,414 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-SLOW-LABEL: vf32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: subq $280, %rsp # imm = 0x118
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm15
-; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3
-; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10
-; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1
-; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15]
-; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8
-; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm7
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5],ymm3[6],ymm10[7,8],ymm3[9],ymm10[10,11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
-; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm13
-; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14
-; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm9[1,2],ymm15[3],ymm9[4],ymm15[5],ymm9[6,7],ymm15[8],ymm9[9,10],ymm15[11],ymm9[12],ymm15[13],ymm9[14,15]
-; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm11
-; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8
+; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3
+; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4
+; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5
+; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7
+; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
+; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6],ymm9[7]
-; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
-; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm9
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm12
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5],ymm15[6],ymm11[7,8],ymm15[9],ymm11[10,11],ymm15[12],ymm11[13],ymm15[14],ymm11[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
-; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm13
-; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm14
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
-; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm7
-; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm8
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm4
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0],xmm7[1],xmm8[2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0
-; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm5
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5,6,7],ymm9[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm7[2],xmm8[3]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4,5],xmm10[6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm9
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4],ymm8[5],ymm3[6,7],ymm8[8],ymm3[9,10],ymm8[11],ymm3[12],ymm8[13],ymm3[14,15]
+; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7]
+; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1,2,3],xmm11[4,5],xmm13[6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm11
+; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm0, %ymm12
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13
+; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm0, %ymm14
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10,11],ymm8[12],ymm3[13],ymm8[14],ymm3[15]
+; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3],xmm0[4,5,6],xmm13[7]
+; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm15
+; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13
+; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3
+; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm11, %ymm10
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm3[0],xmm15[1],xmm3[2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2,3,4],ymm11[5,6,7],ymm9[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13
+; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0],xmm13[1],xmm7[2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm15[2],xmm3[3]
+; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm12
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm5[5,6,7],ymm12[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm10
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7],ymm14[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm13[2],xmm7[3]
+; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm2[2],ymm12[3],ymm2[4],ymm12[5,6],ymm2[7],ymm12[8,9],ymm2[10],ymm12[11],ymm2[12],ymm12[13,14],ymm2[15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
-; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm8[1],xmm7[2,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm5[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6],ymm5[7]
-; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1],ymm13[2],ymm14[3],ymm13[4],ymm14[5,6],ymm13[7],ymm14[8,9],ymm13[10],ymm14[11],ymm13[12],ymm14[13,14],ymm13[15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm4
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3,4],xmm6[5,6,7]
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15]
+; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm11
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6],ymm0[7]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm8[2],ymm5[3],ymm8[4],ymm5[5,6],ymm8[7],ymm5[8,9],ymm8[10],ymm5[11],ymm8[12],ymm5[13,14],ymm8[15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
+; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0],xmm3[1],xmm15[2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4],ymm2[5],ymm12[6,7],ymm2[8],ymm12[9,10],ymm2[11],ymm12[12],ymm2[13],ymm12[14,15]
-; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29>
-; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm8[2],xmm7[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
-; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7]
-; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3]
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm3
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm9[2],xmm10[3]
-; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7]
+; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[3,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7],ymm12[8,9],ymm15[10],ymm12[11],ymm15[12],ymm12[13,14],ymm15[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1],ymm6[2],ymm1[3],ymm6[4],ymm1[5,6],ymm6[7],ymm1[8,9],ymm6[10],ymm1[11],ymm6[12],ymm1[13,14],ymm6[15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm7[1],xmm13[2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7,8],ymm2[9],ymm11[10],ymm2[11],ymm11[12,13],ymm2[14],ymm11[15]
+; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29>
+; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm12[2],xmm15[3]
+; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7]
+; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm1[1,2],ymm6[3],ymm1[4],ymm6[5],ymm1[6,7],ymm6[8],ymm1[9,10],ymm6[11],ymm1[12],ymm6[13],ymm1[14,15]
+; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3]
+; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3]
+; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8)
+; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-SLOW-NEXT: addq $280, %rsp # imm = 0x118
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: vf32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: subq $200, %rsp
-; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm12
-; AVX2-FAST-ALL-NEXT: vmovdqa 256(%rdi), %ymm2
-; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm14
-; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %ymm3
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-ALL-NEXT: subq $264, %rsp # imm = 0x108
+; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm0
+; AVX2-FAST-ALL-NEXT: vmovdqa 256(%rdi), %ymm1
+; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %ymm14
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm2
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm5
; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm7
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3]
; AVX2-FAST-ALL-NEXT: vpermd %ymm8, %ymm10, %ymm8
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm8, %ymm9
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13],ymm4[14],ymm0[15]
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm5
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm8, %xmm0
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3],xmm8[4,5],xmm0[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm0, %ymm9, %ymm9
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10,11],ymm14[12],ymm3[13],ymm14[14],ymm3[15]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm2[1,2],ymm12[3],ymm2[4],ymm12[5],ymm2[6,7],ymm12[8],ymm2[9,10],ymm12[11],ymm2[12],ymm12[13],ymm2[14,15]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm10, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm10
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,u,u,u,4,7,1,6>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15]
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm5
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm13, %xmm1
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3],xmm13[4,5,6],xmm1[7]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm13 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm3[1],ymm14[2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10],ymm3[11],ymm14[12,13],ymm3[14],ymm14[15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm15
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3],xmm1[4,5,6],xmm15[7]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10,11],ymm12[12],ymm2[13],ymm12[14],ymm2[15]
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm12, %ymm4
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm12
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,4,7,1,6>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm15
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm15
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm8, %xmm12
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm8, %xmm13
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm13, %ymm9, %ymm9
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm14, %ymm4
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm13, %xmm12
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm10, %ymm10
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm10, %ymm10
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm12, %ymm10, %ymm10
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm12, %ymm11
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm11, %ymm11
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm15, %ymm6
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm13, %xmm13
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm13, %ymm11, %ymm0
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, %ymm4
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm11, %xmm11
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm15, %ymm11
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,1,3,0,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm1, %ymm11, %ymm1
-; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm8
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm12, %ymm12
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm11, %ymm12, %ymm12
+; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm14, %ymm8
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm8, %ymm8
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa 288(%rdi), %ymm11
-; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm2, %ymm2
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7],ymm10[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,2,3,1,3,6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm9
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm9, %ymm9
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm14, %ymm8
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm8, %ymm8
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7],ymm10[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,6,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm9, %ymm10
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm1, %ymm10, %ymm10
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm9, %ymm0
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm8[2],ymm5[3],ymm8[4],ymm5[5,6],ymm8[7],ymm5[8,9],ymm8[10],ymm5[11],ymm8[12],ymm5[13,14],ymm8[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, %ymm8
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, %ymm6
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm9
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3,4],xmm0[5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5],ymm7[6],ymm3[7,8],ymm7[9],ymm3[10,11],ymm7[12],ymm3[13],ymm7[14],ymm3[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm9, %ymm14, %ymm9
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm9, %ymm9
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,0,1,4,6,0]
+; AVX2-FAST-ALL-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm9, %ymm10
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm10, %ymm10
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm5
+; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15]
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm10, %xmm12
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3,4],xmm10[5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm10, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10,11],ymm12[12],ymm4[13],ymm12[14],ymm4[15]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm14, %ymm10
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm10, %ymm2
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm9, %ymm2
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4],ymm8[5],ymm6[6,7],ymm8[8],ymm6[9,10],ymm8[11],ymm6[12],ymm8[13],ymm6[14,15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm8, %ymm15
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <0,2,u,u,5,7,2,4>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm9, %ymm2
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm10, %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,4,6,0,1,4,6,0]
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0]
; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm10
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm10, %ymm10
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm9
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm6, %ymm9, %ymm9
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm3[2],ymm14[3],ymm3[4],ymm14[5,6],ymm3[7],ymm14[8,9],ymm3[10],ymm14[11],ymm3[12],ymm14[13,14],ymm3[15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm10, %xmm1
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4],xmm10[5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10,11],ymm12[12],ymm4[13],ymm12[14],ymm4[15]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm9, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7>
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, %ymm9
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm0, %xmm0
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm14
; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm10, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21>
; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0]
-; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm1, %ymm9
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm9, %ymm9
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm14[1,2],ymm3[3],ymm14[4],ymm3[5],ymm14[6,7],ymm3[8],ymm14[9,10],ymm3[11],ymm14[12],ymm3[13],ymm14[14,15]
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, %ymm9
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm15
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2],xmm0[3]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm15
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm10, %ymm2
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm2, %ymm2
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm6, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15]
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm3[2],ymm7[3],ymm3[4],ymm7[5,6],ymm3[7],ymm7[8,9],ymm3[10],ymm7[11],ymm3[12],ymm7[13,14],ymm3[15]
+; AVX2-FAST-ALL-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15]
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm1, %ymm5
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15]
-; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm5, %xmm4
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7],ymm12[8,9],ymm15[10],ymm12[11],ymm15[12],ymm12[13,14],ymm15[15]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm1, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FAST-ALL-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: # ymm3 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15]
+; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm14[2],ymm12[3],ymm14[4],ymm12[5,6],ymm14[7],ymm12[8,9],ymm14[10],ymm12[11],ymm14[12],ymm12[13,14],ymm14[15]
; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm2, %ymm2
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5],ymm3[6,7]
; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rsi)
@@ -2551,7 +2568,7 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdx)
-; AVX2-FAST-ALL-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rcx)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rcx)
@@ -2560,207 +2577,208 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%r8)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 32(%r9)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r9)
-; AVX2-FAST-ALL-NEXT: addq $200, %rsp
+; AVX2-FAST-ALL-NEXT: addq $264, %rsp # imm = 0x108
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vf32:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: subq $296, %rsp # imm = 0x128
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15
+; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm7
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm10
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm12
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5],ymm2[6],ymm7[7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13],ymm2[14],ymm7[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4,5],xmm10[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4],ymm11[5],ymm6[6,7],ymm11[8],ymm6[9,10],ymm11[11],ymm6[12],ymm11[13],ymm6[14,15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6],ymm13[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm12, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5],ymm8[6],ymm15[7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13],ymm8[14],ymm15[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1,2,3],xmm12[4,5],xmm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13],ymm10[14],ymm5[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4,5],ymm11[6],ymm4[7,8],ymm11[9],ymm4[10],ymm11[11],ymm4[12,13],ymm11[14],ymm4[15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm13
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm15[1],ymm8[2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10],ymm15[11],ymm8[12,13],ymm15[14],ymm8[15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm12
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm1, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm10[1],xmm12[2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm12, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm9, %ymm12
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm13, %ymm9, %ymm13
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5],ymm11[6],ymm15[7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13],ymm11[14],ymm15[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm15
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm10, %xmm14
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm15, %ymm14
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0],xmm9[1],xmm10[2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7],ymm6[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7],ymm7[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm10[2],xmm12[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7],ymm13[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5],ymm0[6],ymm13[7,8],ymm0[9],ymm13[10,11],ymm0[12],ymm13[13],ymm0[14],ymm13[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm9[2],xmm10[3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7],ymm14[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm14[2],ymm1[3],ymm14[4],ymm1[5,6],ymm14[7],ymm1[8,9],ymm14[10],ymm1[11],ymm14[12],ymm1[13,14],ymm14[15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0],xmm12[1],xmm10[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10,11],ymm3[12],ymm7[13],ymm3[14],ymm7[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0],xmm9[1],xmm8[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm1[1,2],ymm14[3],ymm1[4],ymm14[5],ymm1[6,7],ymm14[8],ymm1[9,10],ymm14[11],ymm1[12],ymm14[13],ymm1[14,15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2],xmm6[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm12[2],xmm10[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm7[1,2],ymm15[3],ymm7[4],ymm15[5],ymm7[6,7],ymm15[8],ymm7[9,10],ymm15[11],ymm7[12],ymm15[13],ymm7[14,15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2],xmm6[3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0],xmm10[1],xmm9[2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm12[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1],ymm5[2],mem[3],ymm5[4],mem[5,6],ymm5[7],mem[8,9],ymm5[10],mem[11],ymm5[12],mem[13,14],ymm5[15]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4],ymm7[5],ymm3[6,7],ymm7[8],ymm3[9,10],ymm7[11],ymm3[12],ymm7[13],ymm3[14,15]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1],xmm10[2],xmm9[3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm6[2],ymm8[3],ymm6[4],ymm8[5,6],ymm6[7],ymm8[8,9],ymm6[10],ymm8[11],ymm6[12],ymm8[13,14],ymm6[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm14[1],ymm7[2,3],ymm14[4],ymm7[5],ymm14[6],ymm7[7,8],ymm14[9],ymm7[10,11],ymm14[12],ymm7[13],ymm14[14],ymm7[15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5],ymm6[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6],ymm3[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
@@ -2776,12 +2794,11 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, (%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 32(%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%r8)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9)
-; AVX2-FAST-PERLANE-NEXT: addq $296, %rsp # imm = 0x128
+; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index 2c210724a72e1..65aee3b3a5ce7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -172,52 +172,52 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm5, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm6
+; SSE-NEXT: pandn %xmm5, %xmm6
; SSE-NEXT: movdqa %xmm1, %xmm7
; SSE-NEXT: psrld $16, %xmm7
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; SSE-NEXT: pand %xmm6, %xmm4
-; SSE-NEXT: por %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,3,2,3]
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: por %xmm6, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3]
; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm5, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: pandn %xmm5, %xmm8
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm6, %xmm7
-; SSE-NEXT: por %xmm2, %xmm7
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm2, %xmm9
+; SSE-NEXT: por %xmm8, %xmm9
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm6, %xmm3
-; SSE-NEXT: pandn %xmm8, %xmm6
-; SSE-NEXT: por %xmm3, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm2, %xmm5
+; SSE-NEXT: pandn %xmm6, %xmm2
+; SSE-NEXT: por %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: movq %xmm3, (%rsi)
; SSE-NEXT: movq %xmm4, (%rdx)
-; SSE-NEXT: movq %xmm7, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
-; SSE-NEXT: movq %xmm5, (%r9)
+; SSE-NEXT: movq %xmm9, (%rcx)
+; SSE-NEXT: movq %xmm2, (%r8)
+; SSE-NEXT: movq %xmm6, (%r9)
; SSE-NEXT: movq %xmm0, (%rax)
; SSE-NEXT: retq
;
@@ -232,7 +232,7 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm5
; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
@@ -245,21 +245,21 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm8, (%rsi)
+; AVX1-NEXT: vmovq %xmm4, (%rsi)
; AVX1-NEXT: vmovq %xmm3, (%rdx)
; AVX1-NEXT: vmovq %xmm5, (%rcx)
; AVX1-NEXT: vmovq %xmm6, (%r8)
-; AVX1-NEXT: vmovq %xmm4, (%r9)
+; AVX1-NEXT: vmovq %xmm7, (%r9)
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: retq
;
@@ -370,124 +370,124 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; SSE-LABEL: vf8:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa 64(%rdi), %xmm6
-; SSE-NEXT: movdqa 80(%rdi), %xmm10
-; SSE-NEXT: movdqa (%rdi), %xmm13
-; SSE-NEXT: movdqa 16(%rdi), %xmm14
-; SSE-NEXT: movdqa 32(%rdi), %xmm9
-; SSE-NEXT: movdqa 48(%rdi), %xmm11
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,2,4,5,6,7]
+; SSE-NEXT: movdqa 64(%rdi), %xmm0
+; SSE-NEXT: movdqa 80(%rdi), %xmm7
+; SSE-NEXT: movdqa (%rdi), %xmm2
+; SSE-NEXT: movdqa 16(%rdi), %xmm5
+; SSE-NEXT: movdqa 32(%rdi), %xmm6
+; SSE-NEXT: movdqa 48(%rdi), %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,2,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; SSE-NEXT: pand %xmm1, %xmm12
-; SSE-NEXT: por %xmm4, %xmm12
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
-; SSE-NEXT: movdqa %xmm10, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[3,0]
-; SSE-NEXT: movaps %xmm6, %xmm15
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm10[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[2,3]
-; SSE-NEXT: pslld $16, %xmm10
-; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm5[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0]
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: pandn %xmm9, %xmm1
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,1,0,3]
+; SSE-NEXT: movdqa %xmm1, %xmm9
+; SSE-NEXT: pandn %xmm3, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
+; SSE-NEXT: movdqa %xmm7, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0]
+; SSE-NEXT: movaps %xmm0, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3]
+; SSE-NEXT: pslld $16, %xmm7
; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0]
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: pandn %xmm9, %xmm7
-; SSE-NEXT: movdqa %xmm13, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm14[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm14[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,0]
-; SSE-NEXT: pand %xmm7, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,6,5,4]
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: pandn %xmm9, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: movdqa %xmm11, %xmm5
+; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm13[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: psrld $16, %xmm9
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; SSE-NEXT: pand %xmm1, %xmm8
+; SSE-NEXT: pandn %xmm6, %xmm1
+; SSE-NEXT: por %xmm8, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm7[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
+; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm7, %xmm10
+; SSE-NEXT: pandn %xmm6, %xmm10
+; SSE-NEXT: movdqa %xmm2, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm6[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[1,0,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm7, %xmm13
+; SSE-NEXT: por %xmm10, %xmm13
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
+; SSE-NEXT: pand %xmm10, %xmm13
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
+; SSE-NEXT: movdqa %xmm10, %xmm14
+; SSE-NEXT: pandn %xmm12, %xmm14
+; SSE-NEXT: por %xmm13, %xmm14
+; SSE-NEXT: movdqa %xmm4, %xmm12
+; SSE-NEXT: psrlq $48, %xmm12
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm12[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm7, %xmm6
+; SSE-NEXT: pandn %xmm8, %xmm7
+; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: pand %xmm10, %xmm7
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
+; SSE-NEXT: movdqa %xmm10, %xmm8
+; SSE-NEXT: pandn %xmm6, %xmm8
+; SSE-NEXT: por %xmm7, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,4,6]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm4[1]
+; SSE-NEXT: movss {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3]
+; SSE-NEXT: andps %xmm10, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6]
+; SSE-NEXT: movdqa %xmm10, %xmm11
+; SSE-NEXT: pandn %xmm7, %xmm11
+; SSE-NEXT: por %xmm6, %xmm11
; SSE-NEXT: psrlq $48, %xmm5
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: pand %xmm7, %xmm0
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,4,6]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm11[1]
-; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE-NEXT: andps %xmm7, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
-; SSE-NEXT: movdqa %xmm7, %xmm5
-; SSE-NEXT: pandn %xmm2, %xmm5
-; SSE-NEXT: por %xmm0, %xmm5
-; SSE-NEXT: psrlq $48, %xmm14
-; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; SSE-NEXT: psrld $16, %xmm11
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm11[1]
-; SSE-NEXT: movss {{.*#+}} xmm0 = xmm13[0],xmm0[1,2,3]
-; SSE-NEXT: andps %xmm7, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7]
-; SSE-NEXT: pandn %xmm2, %xmm7
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: movaps %xmm12, (%rsi)
+; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE-NEXT: psrld $16, %xmm4
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,5,5,7]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1]
+; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3]
+; SSE-NEXT: andps %xmm10, %xmm5
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
+; SSE-NEXT: pandn %xmm0, %xmm10
+; SSE-NEXT: por %xmm5, %xmm10
+; SSE-NEXT: movaps %xmm3, (%rsi)
; SSE-NEXT: movaps %xmm1, (%rdx)
-; SSE-NEXT: movdqa %xmm3, (%rcx)
-; SSE-NEXT: movdqa %xmm4, (%r8)
-; SSE-NEXT: movdqa %xmm5, (%r9)
-; SSE-NEXT: movdqa %xmm7, (%rax)
+; SSE-NEXT: movdqa %xmm14, (%rcx)
+; SSE-NEXT: movdqa %xmm8, (%r8)
+; SSE-NEXT: movdqa %xmm11, (%r9)
+; SSE-NEXT: movdqa %xmm10, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovdqa (%rdi), %xmm8
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
@@ -495,67 +495,67 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7]
; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5
-; AVX1-NEXT: vpslld $16, %xmm5, %xmm10
+; AVX1-NEXT: vpslld $16, %xmm5, %xmm9
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm10 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,1,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm7
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5],xmm0[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm0[5,6,7]
-; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm12
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm12[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm9
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm9 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm11[5,6,7]
+; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,2,3,3]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm12[0],xmm11[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3,4],xmm9[5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,4,6]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm1[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,4,6]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,7]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm1[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX1-NEXT: vmovdqa %xmm9, (%rsi)
-; AVX1-NEXT: vmovdqa %xmm10, (%rdx)
-; AVX1-NEXT: vmovdqa %xmm11, (%rcx)
-; AVX1-NEXT: vmovdqa %xmm0, (%r8)
-; AVX1-NEXT: vmovdqa %xmm3, (%r9)
-; AVX1-NEXT: vmovdqa %xmm1, (%rax)
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX1-NEXT: vmovdqa %xmm3, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm7, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm9, (%r8)
+; AVX1-NEXT: vmovdqa %xmm6, (%r9)
+; AVX1-NEXT: vmovdqa %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf8:
@@ -565,58 +565,58 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX2-SLOW-NEXT: vpslld $16, %xmm0, %xmm2
-; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,0,3]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3],xmm1[4,5],xmm6[6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vpbroadcastw 74(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
+; AVX2-SLOW-NEXT: vpbroadcastw 74(%rdi), %xmm6
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm0[2],xmm8[3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3],xmm1[4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,2,2,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %xmm9, (%rsi)
-; AVX2-SLOW-NEXT: vmovdqa %xmm10, (%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%r8)
-; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%r9)
+; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX2-SLOW-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%r8)
+; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%r9)
; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rax)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -628,151 +628,151 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX2-FAST-NEXT: vpslld $16, %xmm0, %xmm3
-; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm10
-; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3],xmm3[4,5],xmm6[6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0,1,2],xmm8[3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
; AVX2-FAST-NEXT: vpbroadcastw 74(%rdi), %xmm6
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1,2],xmm6[3]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,1,2,0,4,5,6,7]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm0[2],xmm10[3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,1,4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,2,2,2,4,5,6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa %xmm8, (%rsi)
-; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rdx)
-; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX2-FAST-NEXT: vmovdqa %xmm3, (%r8)
-; AVX2-FAST-NEXT: vmovdqa %xmm5, (%r9)
+; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsi)
+; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX2-FAST-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX2-FAST-NEXT: vmovdqa %xmm6, (%r8)
+; AVX2-FAST-NEXT: vmovdqa %xmm1, (%r9)
; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rax)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: vf8:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <3,9,15,u,u,u,u,u>
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm8
+; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm4
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <2,8,14,u,u,u,u,u>
-; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm9
+; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm5
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <1,7,13,u,u,u,u,u>
; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm6
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <0,6,12,u,u,u,u,u>
; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vmovdqa (%rdi), %xmm7
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm8
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX512-NEXT: vpextrw $6, %xmm0, %eax
-; AVX512-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
+; AVX512-NEXT: vpextrw $2, %xmm1, %r10d
+; AVX512-NEXT: vpinsrw $3, %r10d, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %xmm0, %r10d
+; AVX512-NEXT: vpinsrw $4, %r10d, %xmm2, %xmm2
+; AVX512-NEXT: vpextrw $6, %xmm0, %r10d
+; AVX512-NEXT: vpinsrw $5, %r10d, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT: vpextrw $4, %xmm3, %eax
-; AVX512-NEXT: vpinsrw $6, %eax, %xmm2, %xmm5
+; AVX512-NEXT: vpextrw $4, %xmm3, %r10d
+; AVX512-NEXT: vpinsrw $6, %r10d, %xmm2, %xmm9
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512-NEXT: vpextrw $2, %xmm2, %eax
-; AVX512-NEXT: vpinsrw $7, %eax, %xmm5, %xmm10
+; AVX512-NEXT: vpextrw $2, %xmm2, %edi
+; AVX512-NEXT: vpinsrw $7, %edi, %xmm9, %xmm9
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm1[3],xmm6[4,5,6,7]
-; AVX512-NEXT: vpextrw $1, %xmm0, %eax
-; AVX512-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrw $7, %xmm0, %eax
-; AVX512-NEXT: vpinsrw $5, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrw $5, %xmm3, %eax
-; AVX512-NEXT: vpinsrw $6, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrw $3, %xmm2, %eax
-; AVX512-NEXT: vpinsrw $7, %eax, %xmm6, %xmm11
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm9, %xmm5
-; AVX512-NEXT: vpextrw $2, %xmm0, %eax
-; AVX512-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vmovd %xmm3, %eax
-; AVX512-NEXT: vpinsrw $5, %eax, %xmm5, %xmm5
+; AVX512-NEXT: vpextrw $1, %xmm0, %edi
+; AVX512-NEXT: vpinsrw $4, %edi, %xmm6, %xmm6
+; AVX512-NEXT: vpextrw $7, %xmm0, %edi
+; AVX512-NEXT: vpinsrw $5, %edi, %xmm6, %xmm6
+; AVX512-NEXT: vpextrw $5, %xmm3, %edi
+; AVX512-NEXT: vpinsrw $6, %edi, %xmm6, %xmm6
+; AVX512-NEXT: vpextrw $3, %xmm2, %edi
+; AVX512-NEXT: vpinsrw $7, %edi, %xmm6, %xmm6
+; AVX512-NEXT: vpextrw $4, %xmm1, %edi
+; AVX512-NEXT: vpinsrw $3, %edi, %xmm5, %xmm5
+; AVX512-NEXT: vpextrw $2, %xmm0, %edi
+; AVX512-NEXT: vpinsrw $4, %edi, %xmm5, %xmm5
+; AVX512-NEXT: vmovd %xmm3, %edi
+; AVX512-NEXT: vpinsrw $5, %edi, %xmm5, %xmm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7]
-; AVX512-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512-NEXT: vpinsrw $7, %eax, %xmm5, %xmm9
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm8, %xmm6
-; AVX512-NEXT: vpextrw $3, %xmm0, %eax
-; AVX512-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrw $1, %xmm3, %eax
-; AVX512-NEXT: vpinsrw $5, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512-NEXT: vpinsrw $6, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrw $5, %xmm2, %eax
-; AVX512-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrw $2, %xmm4, %eax
-; AVX512-NEXT: vpextrw $4, %xmm7, %edi
-; AVX512-NEXT: vmovd %edi, %xmm5
-; AVX512-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4],xmm5[5,6,7]
-; AVX512-NEXT: vpextrw $2, %xmm3, %eax
-; AVX512-NEXT: vpinsrw $5, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: vpinsrw $6, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vpextrw $6, %xmm2, %eax
-; AVX512-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vpextrw $3, %xmm4, %eax
-; AVX512-NEXT: vpextrw $5, %xmm7, %edi
-; AVX512-NEXT: vmovd %edi, %xmm4
-; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm4, %xmm1
-; AVX512-NEXT: vpextrw $5, %xmm0, %eax
-; AVX512-NEXT: vpinsrw $4, %eax, %xmm1, %xmm0
-; AVX512-NEXT: vpextrw $3, %xmm3, %eax
-; AVX512-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpextrw $1, %xmm2, %eax
-; AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vpextrw $4, %xmm2, %edi
+; AVX512-NEXT: vpinsrw $7, %edi, %xmm5, %xmm5
+; AVX512-NEXT: vpextrw $5, %xmm1, %edi
+; AVX512-NEXT: vpinsrw $3, %edi, %xmm4, %xmm4
+; AVX512-NEXT: vpextrw $3, %xmm0, %edi
+; AVX512-NEXT: vpinsrw $4, %edi, %xmm4, %xmm4
+; AVX512-NEXT: vpextrw $1, %xmm3, %edi
+; AVX512-NEXT: vpinsrw $5, %edi, %xmm4, %xmm4
+; AVX512-NEXT: vpextrw $7, %xmm3, %edi
+; AVX512-NEXT: vpinsrw $6, %edi, %xmm4, %xmm4
+; AVX512-NEXT: vpextrw $5, %xmm2, %edi
+; AVX512-NEXT: vpinsrw $7, %edi, %xmm4, %xmm4
+; AVX512-NEXT: vpextrw $2, %xmm8, %edi
+; AVX512-NEXT: vpextrw $4, %xmm7, %r10d
+; AVX512-NEXT: vmovd %r10d, %xmm10
+; AVX512-NEXT: vpinsrw $1, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vmovd %xmm1, %edi
+; AVX512-NEXT: vpinsrw $2, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrw $6, %xmm1, %edi
+; AVX512-NEXT: vpinsrw $3, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6,7]
+; AVX512-NEXT: vpextrw $2, %xmm3, %edi
+; AVX512-NEXT: vpinsrw $5, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vmovd %xmm2, %edi
+; AVX512-NEXT: vpinsrw $6, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrw $6, %xmm2, %edi
+; AVX512-NEXT: vpinsrw $7, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrw $3, %xmm8, %edi
+; AVX512-NEXT: vpextrw $5, %xmm7, %r10d
+; AVX512-NEXT: vmovd %r10d, %xmm7
+; AVX512-NEXT: vpinsrw $1, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrw $1, %xmm1, %edi
+; AVX512-NEXT: vpinsrw $2, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrw $7, %xmm1, %edi
+; AVX512-NEXT: vpinsrw $3, %edi, %xmm7, %xmm1
+; AVX512-NEXT: vpextrw $5, %xmm0, %edi
+; AVX512-NEXT: vpinsrw $4, %edi, %xmm1, %xmm0
+; AVX512-NEXT: vpextrw $3, %xmm3, %edi
+; AVX512-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpextrw $1, %xmm2, %edi
+; AVX512-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512-NEXT: vmovdqa %xmm10, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm11, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm9, (%rcx)
-; AVX512-NEXT: vmovdqa %xmm6, (%r8)
-; AVX512-NEXT: vmovdqa %xmm5, (%r9)
-; AVX512-NEXT: vmovdqa %xmm0, (%r10)
+; AVX512-NEXT: vmovdqa %xmm9, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm5, (%rcx)
+; AVX512-NEXT: vmovdqa %xmm4, (%r8)
+; AVX512-NEXT: vmovdqa %xmm10, (%r9)
+; AVX512-NEXT: vmovdqa %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%wide.vec = load <48 x i16>, ptr %in.vec, align 32
@@ -797,630 +797,614 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: vf16:
; SSE: # %bb.0:
-; SSE-NEXT: subq $72, %rsp
-; SSE-NEXT: movdqa 112(%rdi), %xmm4
-; SSE-NEXT: movdqa 128(%rdi), %xmm11
+; SSE-NEXT: subq $104, %rsp
+; SSE-NEXT: movdqa 112(%rdi), %xmm8
+; SSE-NEXT: movdqa 128(%rdi), %xmm12
; SSE-NEXT: movdqa 64(%rdi), %xmm2
-; SSE-NEXT: movdqa 80(%rdi), %xmm12
+; SSE-NEXT: movdqa 80(%rdi), %xmm10
; SSE-NEXT: movdqa (%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm15
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rdi), %xmm5
+; SSE-NEXT: movdqa 48(%rdi), %xmm11
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: movdqa %xmm13, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
-; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm10, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pslld $16, %xmm12
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
-; SSE-NEXT: movdqa %xmm5, %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSE-NEXT: pslld $16, %xmm10
+; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm7, %xmm5
-; SSE-NEXT: movdqa %xmm7, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,2,4,5,6,7]
+; SSE-NEXT: movdqa %xmm13, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa 96(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,6,6,7]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE-NEXT: pand %xmm5, %xmm8
-; SSE-NEXT: por %xmm2, %xmm8
-; SSE-NEXT: movdqa 160(%rdi), %xmm3
-; SSE-NEXT: movdqa 176(%rdi), %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
-; SSE-NEXT: pslld $16, %xmm2
-; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE-NEXT: movdqa 144(%rdi), %xmm13
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0]
-; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,6,6,7]
+; SSE-NEXT: movdqa %xmm8, %xmm3
+; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; SSE-NEXT: pand %xmm13, %xmm10
+; SSE-NEXT: por %xmm2, %xmm10
+; SSE-NEXT: movdqa 160(%rdi), %xmm4
+; SSE-NEXT: movdqa 176(%rdi), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0]
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm4, %xmm15
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3]
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; SSE-NEXT: movdqa 144(%rdi), %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm9[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm5, %xmm15
+; SSE-NEXT: psrld $16, %xmm15
; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm15[2],xmm9[3],xmm15[3]
+; SSE-NEXT: movdqa %xmm13, %xmm15
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: pandn %xmm14, %xmm1
-; SSE-NEXT: pand %xmm5, %xmm9
-; SSE-NEXT: por %xmm1, %xmm9
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,1,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[2,0]
-; SSE-NEXT: psrld $16, %xmm4
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pandn %xmm11, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pandn %xmm14, %xmm15
+; SSE-NEXT: pand %xmm13, %xmm9
+; SSE-NEXT: por %xmm15, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm7[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[2,0]
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: psrld $16, %xmm7
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE-NEXT: pand %xmm13, %xmm6
+; SSE-NEXT: movdqa %xmm12, %xmm15
+; SSE-NEXT: pandn %xmm12, %xmm13
+; SSE-NEXT: por %xmm6, %xmm13
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm14, %xmm1
; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
+; SSE-NEXT: movdqa %xmm11, %xmm13
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm0, %xmm10
-; SSE-NEXT: pandn %xmm1, %xmm10
-; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm15[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[1,0,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: por %xmm10, %xmm2
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,2]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,6,5,4]
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,0]
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: pandn %xmm10, %xmm4
-; SSE-NEXT: pand %xmm7, %xmm2
-; SSE-NEXT: por %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: movdqa %xmm11, %xmm14
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,1,1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm2[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm1, %xmm6
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: por %xmm6, %xmm1
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
-; SSE-NEXT: movdqa %xmm7, %xmm5
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: pand %xmm7, %xmm6
-; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: movdqa %xmm12, %xmm4
-; SSE-NEXT: psrlq $48, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm6, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
+; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0]
+; SSE-NEXT: movdqa %xmm12, %xmm5
+; SSE-NEXT: pandn %xmm6, %xmm5
+; SSE-NEXT: pand %xmm12, %xmm1
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: movdqa %xmm15, %xmm2
+; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill
+; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm1, %xmm6
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm3[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm3[2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por %xmm4, %xmm1
-; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
-; SSE-NEXT: movdqa %xmm7, %xmm11
-; SSE-NEXT: pandn %xmm4, %xmm11
-; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,4]
+; SSE-NEXT: movdqa %xmm12, %xmm11
+; SSE-NEXT: pandn %xmm6, %xmm11
+; SSE-NEXT: pand %xmm12, %xmm1
; SSE-NEXT: por %xmm1, %xmm11
; SSE-NEXT: movdqa %xmm13, %xmm1
; SSE-NEXT: psrlq $48, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: por %xmm1, %xmm6
+; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; SSE-NEXT: movdqa %xmm12, %xmm7
+; SSE-NEXT: pandn %xmm1, %xmm7
+; SSE-NEXT: pand %xmm12, %xmm6
+; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: psrlq $48, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm4
-; SSE-NEXT: pand %xmm7, %xmm0
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
-; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3]
+; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,4,6]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm12[1]
-; SSE-NEXT: movss {{.*#+}} xmm6 = xmm1[0],xmm6[1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,4,6]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm13[1]
+; SSE-NEXT: movss {{.*#+}} xmm15 = xmm1[0],xmm15[1,2,3]
; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6]
-; SSE-NEXT: movdqa %xmm7, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: andps %xmm7, %xmm6
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[1,1,1,1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[2,3,2,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
-; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
+; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: pandn %xmm0, %xmm2
+; SSE-NEXT: andps %xmm12, %xmm15
+; SSE-NEXT: por %xmm15, %xmm2
+; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[1,1,1,1]
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = mem[2,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; SSE-NEXT: pshufd $196, (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,0,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm13[1]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,6]
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: pandn %xmm6, %xmm0
-; SSE-NEXT: andps %xmm7, %xmm1
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: psrlq $48, %xmm15
-; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE-NEXT: psrld $16, %xmm12
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm12[1]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm10[0],xmm1[1,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1]
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm15[0],xmm0[1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,6]
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: pandn %xmm15, %xmm1
+; SSE-NEXT: andps %xmm12, %xmm0
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: psrlq $48, %xmm6
+; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE-NEXT: psrld $16, %xmm13
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1]
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm6[0,1,2,3,4,4,5,7]
-; SSE-NEXT: movdqa %xmm7, %xmm14
-; SSE-NEXT: pandn %xmm12, %xmm14
-; SSE-NEXT: andps %xmm7, %xmm1
-; SSE-NEXT: por %xmm1, %xmm14
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: psrlq $48, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE-NEXT: movdqa %xmm1, %xmm10
-; SSE-NEXT: psrld $16, %xmm13
-; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm13[1]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm10[0],xmm1[1,2,3]
-; SSE-NEXT: andps %xmm7, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
-; SSE-NEXT: pandn %xmm3, %xmm7
-; SSE-NEXT: por %xmm1, %xmm7
-; SSE-NEXT: movaps %xmm8, 16(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, (%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, 16(%rdx)
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7]
+; SSE-NEXT: movdqa %xmm12, %xmm15
+; SSE-NEXT: pandn %xmm6, %xmm15
+; SSE-NEXT: andps %xmm12, %xmm0
+; SSE-NEXT: por %xmm0, %xmm15
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: psrlq $48, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE-NEXT: psrld $16, %xmm8
+; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1]
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
+; SSE-NEXT: andps %xmm12, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7]
+; SSE-NEXT: pandn %xmm4, %xmm12
+; SSE-NEXT: por %xmm0, %xmm12
+; SSE-NEXT: movaps %xmm10, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, (%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 16(%rdx)
; SSE-NEXT: movaps %xmm9, (%rdx)
-; SSE-NEXT: movdqa %xmm5, 16(%rcx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, (%rcx)
-; SSE-NEXT: movdqa %xmm4, 16(%r8)
-; SSE-NEXT: movdqa %xmm11, (%r8)
-; SSE-NEXT: movdqa %xmm0, 16(%r9)
+; SSE-NEXT: movdqa %xmm11, 16(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, (%rcx)
+; SSE-NEXT: movdqa %xmm3, 16(%r8)
+; SSE-NEXT: movdqa %xmm7, (%r8)
+; SSE-NEXT: movdqa %xmm1, 16(%r9)
; SSE-NEXT: movdqa %xmm2, (%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa %xmm7, 16(%rax)
-; SSE-NEXT: movdqa %xmm14, (%rax)
-; SSE-NEXT: addq $72, %rsp
+; SSE-NEXT: movdqa %xmm12, 16(%rax)
+; SSE-NEXT: movdqa %xmm15, (%rax)
+; SSE-NEXT: addq $104, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $184, %rsp
+; AVX1-NEXT: subq $104, %rsp
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7]
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX1-NEXT: vpslld $16, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa %xmm2, %xmm10
-; AVX1-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovdqa (%rdi), %xmm11
+; AVX1-NEXT: vmovdqa %xmm1, %xmm7
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpslld $16, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm5
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm8
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrlq $16, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa %xmm3, %xmm7
+; AVX1-NEXT: vpsrlq $16, %xmm8, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,0,2,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,1,0,3]
-; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7]
-; AVX1-NEXT: vmovdqa 176(%rdi), %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa 160(%rdi), %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX1-NEXT: vmovdqa 128(%rdi), %xmm1
-; AVX1-NEXT: vpsrlq $16, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa %xmm1, %xmm9
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[0,1,0,2,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5],xmm11[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2],ymm5[3,4,5],ymm10[6,7]
+; AVX1-NEXT: vmovdqa 176(%rdi), %xmm10
+; AVX1-NEXT: vpslld $16, %xmm10, %xmm5
+; AVX1-NEXT: vmovdqa 160(%rdi), %xmm11
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm12 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
+; AVX1-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX1-NEXT: vpsrlq $16, %xmm12, %xmm0
; AVX1-NEXT: vmovdqa 144(%rdi), %xmm1
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,1,0,2,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
-; AVX1-NEXT: vandps %ymm2, %ymm4, %ymm4
+; AVX1-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm3, %ymm14, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovaps %ymm2, %ymm14
-; AVX1-NEXT: vorps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vandnps %ymm0, %ymm14, %ymm0
+; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpsrld $16, %xmm5, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,2,3,3]
-; AVX1-NEXT: vmovdqa %xmm10, %xmm13
-; AVX1-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm7, %xmm2
; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; AVX1-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsrld $16, %xmm6, %xmm7
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7]
+; AVX1-NEXT: vpsrld $16, %xmm7, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,1,1]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
+; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsrld $16, %xmm6, %xmm9
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,3,3]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vandnps %ymm1, %ymm14, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm11[2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm5[0,1],xmm14[2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm13[4,5],xmm8[6,7]
-; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm15
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm15 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandnps %ymm0, %ymm15, %ymm0
-; AVX1-NEXT: vandps %ymm1, %ymm15, %ymm1
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm5 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm5 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1,2,3],xmm13[4,5],xmm4[6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm15, %xmm13
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandnps %ymm0, %ymm13, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm13, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm13 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm13[0],xmm1[0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm12[4,5],xmm7[6,7]
-; AVX1-NEXT: vpshufb %xmm5, %xmm13, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
-; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm9
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7]
+; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vandnps %ymm1, %ymm8, %ymm1
+; AVX1-NEXT: vandnps %ymm1, %ymm14, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa %xmm3, %xmm6
-; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
-; AVX1-NEXT: vpshufb %xmm5, %xmm10, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
-; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT: vandnps %ymm1, %ymm15, %ymm1
-; AVX1-NEXT: vandps %ymm3, %ymm15, %ymm3
-; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[2,2,3,3]
-; AVX1-NEXT: vmovdqa %xmm9, %xmm10
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
-; AVX1-NEXT: vandps %ymm1, %ymm8, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT: vandnps %ymm3, %ymm8, %ymm3
-; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,3,2,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
-; AVX1-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,1,1]
+; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vmovdqa %xmm2, %xmm0
+; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,2,3,3]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm9[0],xmm1[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
+; AVX1-NEXT: vpshufb %xmm3, %xmm15, %xmm9
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2
+; AVX1-NEXT: vandnps %ymm1, %ymm13, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm13, %ymm2
+; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vpsrlq $48, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm9[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
+; AVX1-NEXT: vandps %ymm1, %ymm14, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm14
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,4,6]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm6[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3,4,5,6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm1
-; AVX1-NEXT: vandps %ymm7, %ymm5, %ymm5
-; AVX1-NEXT: vorps %ymm1, %ymm5, %ymm1
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm5 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm11[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX1-NEXT: vpsrlq $48, %xmm9, %xmm1
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpsrlq $48, %xmm8, %xmm3
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-NEXT: vpsrld $16, %xmm6, %xmm3
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5,5,7]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm6[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
-; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm1
-; AVX1-NEXT: vandps %ymm7, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm11, %xmm3
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,5,7]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%rsi)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%rdx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%rcx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%r8)
-; AVX1-NEXT: vmovaps %ymm12, (%r9)
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,4,5,4,6]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm0[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm9, %ymm1
+; AVX1-NEXT: vandps %ymm7, %ymm9, %ymm7
+; AVX1-NEXT: vorps %ymm1, %ymm7, %ymm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,5,4,6]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4],xmm3[5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX1-NEXT: vpsrlq $48, %xmm5, %xmm3
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm11 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
+; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vpsrlq $48, %xmm4, %xmm3
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm4
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,7]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
+; AVX1-NEXT: vandnps %ymm2, %ymm9, %ymm2
+; AVX1-NEXT: vandps %ymm3, %ymm9, %ymm3
+; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm2
+; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm12, %xmm4
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,5,5,7]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm3, (%rsi)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rdx)
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rcx)
+; AVX1-NEXT: vmovaps %ymm14, (%r8)
+; AVX1-NEXT: vmovaps %ymm1, (%r9)
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovaps %ymm0, (%rax)
-; AVX1-NEXT: addq $184, %rsp
+; AVX1-NEXT: vmovaps %ymm2, (%rax)
+; AVX1-NEXT: addq $104, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5
-; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm15
+; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm2[2,3],ymm5[2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[0,2,2,1,4,6,6,5]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,1,4,6,6,5]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm2[0,1],ymm5[0,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[0,3,2,3,4,7,6,7]
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,3,2,3,4,7,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4,5,6],ymm11[7],ymm2[8,9],ymm11[10],ymm2[11,12,13,14],ymm11[15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,0,3]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3],xmm4[4,5],xmm1[6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,1,2,1,6,5,6,5]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11,12,13,14],ymm1[15]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,6,4]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,0,3,5,5,4,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,3,2,3,4,7,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3,4],ymm4[5,6],ymm8[7],ymm4[8],ymm8[9,10,11,12],ymm4[13,14],ymm8[15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,1,2,3]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm5
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5],ymm12[6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,2,1,6,5,6,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4,5,6],ymm9[7],ymm5[8,9],ymm9[10],ymm5[11,12,13,14],ymm9[15]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5],ymm9[6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,0,3,5,5,4,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[0,3,2,3,4,7,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3,4],ymm10[5,6],ymm13[7],ymm10[8],ymm13[9,10,11,12],ymm10[13,14],ymm13[15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,1,2,3]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,6,6,6,6]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5,6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[1,1,1,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6],xmm2[7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm12[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3,4],ymm9[5,6],ymm10[7],ymm9[8],ymm10[9,10,11,12],ymm9[13,14],ymm10[15]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,1,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[0,1,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2,3],xmm4[4],xmm10[5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,2,1,6,5,6,5]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3,4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10,11,12],ymm10[13],ymm11[14,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3,4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10,11,12],ymm1[13],ymm6[14,15]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,0,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3,4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9,10,11,12],ymm1[13,14],ymm3[15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,2,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,2,1,6,5,6,5]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,1,0,3,4,5,4,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3,4],ymm4[5],ymm6[6,7],ymm4[8],ymm6[9,10,11,12],ymm4[13],ymm6[14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,3,3,4,5,7,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3,4],ymm3[5],ymm5[6,7],ymm3[8],ymm5[9,10,11,12],ymm3[13],ymm5[14,15]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1
-; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,0,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rdx)
; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9)
@@ -1431,125 +1415,121 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
;
; AVX2-FAST-LABEL: vf16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14
-; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm15
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3
+; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5
; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
+; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9
-; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm2[2,3],ymm5[2,3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[2,1,2,1,6,5,6,5]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10
+; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[2,1,2,1,6,5,6,5]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,12,13,u,u,u,u,16,17,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,3,2,3,4,7,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,u,u,16,17,20,21,u,u,22,23,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4,5,6],ymm12[7],ymm2[8,9],ymm12[10],ymm2[11,12,13,14],ymm12[15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3],xmm4[4,5],xmm1[6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,u,u,18,19,22,23,u,u,22,23,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11,12,13,14],ymm1[15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,6,4]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[4,5,u,u,u,u,u,u,u,u,0,1,12,13,u,u,20,21,u,u,u,u,u,u,u,u,16,17,28,29,u,u]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,3,2,3,4,7,6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm9[u,u,0,1,4,5,6,7,8,9,u,u,u,u,8,9,u,u,16,17,20,21,22,23,24,25,u,u,u,u,24,25]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3,4],ymm4[5,6],ymm8[7],ymm4[8],ymm8[9,10,11,12],ymm4[13,14],ymm8[15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,1,2,3]
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,1,2,0,4,5,6,7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm2
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm6
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2],ymm6[3,4,5,6,7],ymm4[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,2,3,6,7,6,7,10,11,u,u,u,u,10,11,u,u,18,19,22,23,22,23,26,27,u,u,u,u,26,27]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3,4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9,10,11,12],ymm1[13,14],ymm3[15]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[3,1,2,1,4,5,6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,1,2,1,6,5,6,5]
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,4,5,4,5,4,5,8,9,u,u,8,9,12,13,u,u,20,21,20,21,20,21,24,25,u,u,24,25,28,29]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3,4],ymm1[5],ymm3[6,7],ymm1[8],ymm3[9,10,11,12],ymm1[13],ymm3[14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,0,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5],ymm13[6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,u,u,18,19,22,23,u,u,22,23,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4,5,6],ymm9[7],ymm5[8,9],ymm9[10],ymm5[11,12,13,14],ymm9[15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5],ymm9[6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[4,5,u,u,u,u,u,u,u,u,0,1,12,13,u,u,20,21,u,u,u,u,u,u,u,u,16,17,28,29,u,u]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[0,3,2,3,4,7,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[u,u,0,1,4,5,6,7,8,9,u,u,u,u,8,9,u,u,16,17,20,21,22,23,24,25,u,u,u,u,24,25]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3,4],ymm10[5,6],ymm13[7],ymm10[8],ymm13[9,10,11,12],ymm10[13,14],ymm13[15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,1,2,3]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,1,2,0,4,5,6,7]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm2[1,2],xmm15[3],xmm2[4,5,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6],xmm2[7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,2,3,6,7,6,7,10,11,u,u,u,u,10,11,u,u,18,19,22,23,22,23,26,27,u,u,u,u,26,27]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3,4],ymm9[5,6],ymm10[7],ymm9[8],ymm10[9,10,11,12],ymm9[13,14],ymm10[15]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,1,2,1,4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,1,6,5,6,5]
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm2[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,4,5,4,5,4,5,8,9,u,u,8,9,12,13,u,u,20,21,20,21,20,21,24,25,u,u,24,25,28,29]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1,2,3,4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10,11,12],ymm7[13],ymm10[14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,2,2,2,4,5,6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6],ymm11[7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,u,u,u,u,u,u,u,u,14,15,u,u,u,u,18,19,u,u,u,u,u,u,u,u,30,31,u,u,u,u]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,6,7,6,7,6,7,8,9,u,u,10,11,14,15,u,u,22,23,22,23,22,23,24,25,u,u,26,27,30,31]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10,11,12],ymm0[13],ymm2[14,15]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,u,u,u,u,u,u,u,u,14,15,u,u,u,u,18,19,u,u,u,u,u,u,u,u,30,31,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,6,7,6,7,6,7,8,9,u,u,10,11,14,15,u,u,22,23,22,23,22,23,24,25,u,u,26,27,30,31]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10,11,12],ymm1[13],ymm2[14,15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx)
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi)
+; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rdx)
; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx)
; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8)
-; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FAST-NEXT: vzeroupper
@@ -1633,231 +1613,233 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-LABEL: vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $456, %rsp # imm = 0x1C8
-; SSE-NEXT: movdqa 208(%rdi), %xmm8
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 224(%rdi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 208(%rdi), %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 224(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 64(%rdi), %xmm2
-; SSE-NEXT: movdqa 80(%rdi), %xmm7
+; SSE-NEXT: movdqa 80(%rdi), %xmm10
; SSE-NEXT: movdqa (%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rdi), %xmm14
; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm11
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: movdqa %xmm10, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pslld $16, %xmm7
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,3,2,3]
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSE-NEXT: pslld $16, %xmm10
+; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7]
+; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: movdqa 192(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movdqa 256(%rdi), %xmm4
; SSE-NEXT: movdqa 272(%rdi), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0]
-; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm4, %xmm3
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE-NEXT: movdqa 240(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 128(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm10, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm3
; SSE-NEXT: pandn %xmm0, %xmm3
-; SSE-NEXT: movdqa 112(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 112(%rdi), %xmm12
; SSE-NEXT: movdqa 96(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: movdqa 160(%rdi), %xmm5
-; SSE-NEXT: movdqa 176(%rdi), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pslld $16, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE-NEXT: movdqa 160(%rdi), %xmm4
+; SSE-NEXT: movdqa 176(%rdi), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm4, %xmm3
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE-NEXT: movdqa 144(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,0,2,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 320(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
+; SSE-NEXT: movdqa %xmm8, %xmm3
+; SSE-NEXT: pandn %xmm0, %xmm3
; SSE-NEXT: movdqa 304(%rdi), %xmm15
-; SSE-NEXT: movdqa 288(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE-NEXT: pand %xmm10, %xmm1
-; SSE-NEXT: por %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: movdqa 352(%rdi), %xmm12
+; SSE-NEXT: movdqa 288(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,6,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movdqa 352(%rdi), %xmm0
; SSE-NEXT: movdqa 368(%rdi), %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
+; SSE-NEXT: movdqa %xmm11, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[3,0]
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm9
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm11[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm11[2,3]
; SSE-NEXT: pslld $16, %xmm11
; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
; SSE-NEXT: movdqa 336(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,0,2,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm7[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm3, %xmm7
-; SSE-NEXT: psrld $16, %xmm7
-; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[0,1,0,2,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm11[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, %xmm4
+; SSE-NEXT: movdqa %xmm14, %xmm9
+; SSE-NEXT: psrld $16, %xmm9
+; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; SSE-NEXT: movdqa %xmm10, %xmm9
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: pandn %xmm14, %xmm9
+; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: por %xmm9, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm9[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0]
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pshufhw $237, (%rsp), %xmm11 # 16-byte Folded Reload
; SSE-NEXT: # xmm11 = mem[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
-; SSE-NEXT: movdqa %xmm10, %xmm7
+; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm10, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: pandn %xmm9, %xmm7
+; SSE-NEXT: pandn %xmm9, %xmm0
; SSE-NEXT: pand %xmm10, %xmm11
-; SSE-NEXT: por %xmm7, %xmm11
-; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm7[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0]
+; SSE-NEXT: por %xmm0, %xmm11
+; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[2,0]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: psrld $16, %xmm12
+; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE-NEXT: movdqa %xmm10, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm11, %xmm7
-; SSE-NEXT: psrld $16, %xmm7
-; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; SSE-NEXT: movdqa %xmm10, %xmm7
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: pandn %xmm4, %xmm7
-; SSE-NEXT: pand %xmm10, %xmm0
-; SSE-NEXT: por %xmm7, %xmm0
-; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[0,1,1,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: psrld $16, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; SSE-NEXT: movdqa %xmm10, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pandn %xmm7, %xmm2
-; SSE-NEXT: pand %xmm10, %xmm13
-; SSE-NEXT: por %xmm2, %xmm13
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,1,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm2[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[2,0]
+; SSE-NEXT: pandn %xmm11, %xmm0
+; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0]
+; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
; SSE-NEXT: movdqa %xmm15, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE-NEXT: pand %xmm10, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: pandn %xmm14, %xmm10
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: pandn %xmm7, %xmm10
; SSE-NEXT: por %xmm2, %xmm10
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm14, %xmm13
+; SSE-NEXT: movdqa %xmm14, %xmm0
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,0,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: por %xmm2, %xmm4
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
@@ -1865,156 +1847,154 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4]
-; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
-; SSE-NEXT: movdqa %xmm10, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm6
-; SSE-NEXT: pand %xmm10, %xmm3
-; SSE-NEXT: por %xmm3, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0]
+; SSE-NEXT: movdqa %xmm12, %xmm5
+; SSE-NEXT: pandn %xmm2, %xmm5
+; SSE-NEXT: pand %xmm12, %xmm4
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm2
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm2, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm11[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm6[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: pand %xmm10, %xmm2
-; SSE-NEXT: por %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm14, %xmm2
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
+; SSE-NEXT: movdqa %xmm12, %xmm6
+; SSE-NEXT: pandn %xmm4, %xmm6
+; SSE-NEXT: pand %xmm12, %xmm2
+; SSE-NEXT: por %xmm2, %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm7, %xmm2
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm15[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm15[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: por %xmm3, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: por %xmm4, %xmm6
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,2]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,6,5,4]
-; SSE-NEXT: movdqa %xmm10, %xmm3
-; SSE-NEXT: pandn %xmm11, %xmm3
-; SSE-NEXT: pand %xmm10, %xmm4
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
+; SSE-NEXT: movdqa %xmm12, %xmm7
+; SSE-NEXT: pandn %xmm4, %xmm7
+; SSE-NEXT: pand %xmm12, %xmm6
+; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm11, %xmm7
+; SSE-NEXT: movdqa %xmm11, %xmm4
+; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,1,1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pandn %xmm4, %xmm6
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm7
-; SSE-NEXT: por %xmm4, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm1, %xmm8
+; SSE-NEXT: por %xmm6, %xmm8
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pandn %xmm4, %xmm0
-; SSE-NEXT: pand %xmm10, %xmm7
-; SSE-NEXT: por %xmm7, %xmm0
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
+; SSE-NEXT: pand %xmm12, %xmm8
+; SSE-NEXT: por %xmm8, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm4
-; SSE-NEXT: psrlq $48, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,2,3,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm4[0]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm7, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: por %xmm4, %xmm5
-; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pandn %xmm4, %xmm0
-; SSE-NEXT: pand %xmm10, %xmm5
-; SSE-NEXT: por %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm10, %xmm6
+; SSE-NEXT: psrlq $48, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,2,3,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0]
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pandn %xmm8, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
+; SSE-NEXT: pand %xmm12, %xmm3
+; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: psrlq $48, %xmm8
-; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = mem[2,2,3,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm5, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: psrlq $48, %xmm9
+; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; SSE-NEXT: # xmm6 = mem[2,2,3,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pandn %xmm6, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: por %xmm4, %xmm5
-; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pandn %xmm4, %xmm0
-; SSE-NEXT: pand %xmm10, %xmm5
+; SSE-NEXT: por %xmm3, %xmm5
+; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: pandn %xmm3, %xmm0
+; SSE-NEXT: pand %xmm12, %xmm5
; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm4
-; SSE-NEXT: psrlq $48, %xmm4
-; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = mem[2,2,3,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm5, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
-; SSE-NEXT: movdqa %xmm10, %xmm11
-; SSE-NEXT: pandn %xmm4, %xmm11
-; SSE-NEXT: pand %xmm10, %xmm3
-; SSE-NEXT: por %xmm3, %xmm11
-; SSE-NEXT: movdqa %xmm14, %xmm3
+; SSE-NEXT: movdqa %xmm11, %xmm9
+; SSE-NEXT: movdqa %xmm11, %xmm3
; SSE-NEXT: psrlq $48, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,2,3,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: por %xmm3, %xmm4
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
+; SSE-NEXT: movdqa %xmm12, %xmm11
+; SSE-NEXT: pandn %xmm3, %xmm11
+; SSE-NEXT: pand %xmm12, %xmm4
+; SSE-NEXT: por %xmm4, %xmm11
+; SSE-NEXT: psrlq $48, %xmm14
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,2,3,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm14[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
@@ -2024,32 +2004,31 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; SSE-NEXT: movdqa %xmm10, %xmm14
-; SSE-NEXT: pandn %xmm2, %xmm14
-; SSE-NEXT: pand %xmm10, %xmm1
-; SSE-NEXT: por %xmm1, %xmm14
+; SSE-NEXT: movdqa %xmm12, %xmm7
+; SSE-NEXT: pandn %xmm2, %xmm7
+; SSE-NEXT: pand %xmm12, %xmm1
+; SSE-NEXT: por %xmm1, %xmm7
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[2,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,1,0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm10[1]
; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
-; SSE-NEXT: movdqa %xmm10, %xmm9
-; SSE-NEXT: pandn %xmm2, %xmm9
-; SSE-NEXT: andps %xmm10, %xmm3
-; SSE-NEXT: por %xmm3, %xmm9
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[1,1,1,1]
+; SSE-NEXT: movdqa %xmm12, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: andps %xmm12, %xmm3
+; SSE-NEXT: por %xmm3, %xmm6
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[0,1,0,3]
@@ -2062,9 +2041,9 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6]
-; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: movdqa %xmm12, %xmm5
; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: andps %xmm10, %xmm2
+; SSE-NEXT: andps %xmm12, %xmm2
; SSE-NEXT: por %xmm2, %xmm5
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[1,1,1,1]
@@ -2075,16 +2054,15 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: # xmm1 = mem[0,1,0,3]
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6]
-; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = xmm2[1],mem[1]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1]
; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6]
-; SSE-NEXT: movdqa %xmm10, %xmm4
+; SSE-NEXT: movdqa %xmm12, %xmm4
; SSE-NEXT: pandn %xmm1, %xmm4
-; SSE-NEXT: andps %xmm10, %xmm2
+; SSE-NEXT: andps %xmm12, %xmm2
; SSE-NEXT: por %xmm2, %xmm4
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[1,1,1,1]
@@ -2098,84 +2076,85 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = xmm1[1],mem[1]
; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
-; SSE-NEXT: movdqa %xmm10, %xmm3
+; SSE-NEXT: movdqa %xmm12, %xmm3
; SSE-NEXT: pandn %xmm2, %xmm3
-; SSE-NEXT: andps %xmm10, %xmm1
+; SSE-NEXT: andps %xmm12, %xmm1
; SSE-NEXT: por %xmm1, %xmm3
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: psrld $16, %xmm6
+; SSE-NEXT: psrld $16, %xmm10
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1]
; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7]
-; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: movdqa %xmm12, %xmm1
; SSE-NEXT: pandn %xmm8, %xmm1
-; SSE-NEXT: andps %xmm10, %xmm2
+; SSE-NEXT: andps %xmm12, %xmm2
; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: psrlq $48, %xmm6
-; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: psrld $16, %xmm7
+; SSE-NEXT: psrlq $48, %xmm15
+; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: psrld $16, %xmm9
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1]
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,7]
-; SSE-NEXT: movdqa %xmm10, %xmm15
-; SSE-NEXT: pandn %xmm7, %xmm15
-; SSE-NEXT: andps %xmm10, %xmm2
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1]
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm13[0],xmm2[1,2,3]
+; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
+; SSE-NEXT: movdqa %xmm12, %xmm15
+; SSE-NEXT: pandn %xmm8, %xmm15
+; SSE-NEXT: andps %xmm12, %xmm2
; SSE-NEXT: por %xmm2, %xmm15
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: psrld $16, %xmm7
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: psrld $16, %xmm9
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1]
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,5,7]
-; SSE-NEXT: movdqa %xmm10, %xmm7
-; SSE-NEXT: pandn %xmm8, %xmm7
-; SSE-NEXT: andps %xmm10, %xmm2
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: psrlq $48, %xmm6
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1]
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3]
+; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
+; SSE-NEXT: movdqa %xmm12, %xmm13
+; SSE-NEXT: pandn %xmm8, %xmm13
+; SSE-NEXT: andps %xmm12, %xmm2
+; SSE-NEXT: por %xmm2, %xmm13
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: psrlq $48, %xmm8
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; SSE-NEXT: movdqa %xmm2, %xmm8
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: psrld $16, %xmm6
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: psrld $16, %xmm9
; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1]
; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3]
-; SSE-NEXT: andps %xmm10, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7]
-; SSE-NEXT: pandn %xmm6, %xmm10
-; SSE-NEXT: por %xmm2, %xmm10
+; SSE-NEXT: andps %xmm12, %xmm2
+; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
+; SSE-NEXT: pandn %xmm8, %xmm12
+; SSE-NEXT: por %xmm2, %xmm12
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 48(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -2186,7 +2165,8 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: movaps %xmm2, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 48(%rdx)
-; SSE-NEXT: movaps %xmm13, 16(%rdx)
+; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 32(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -2195,11 +2175,11 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: movaps %xmm2, 16(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 48(%rcx)
-; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 32(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, (%rcx)
-; SSE-NEXT: movdqa %xmm14, 48(%r8)
+; SSE-NEXT: movdqa %xmm7, 48(%r8)
; SSE-NEXT: movdqa %xmm11, 16(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%r8)
@@ -2208,10 +2188,10 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; SSE-NEXT: movdqa %xmm3, 48(%r9)
; SSE-NEXT: movdqa %xmm4, 16(%r9)
; SSE-NEXT: movdqa %xmm5, 32(%r9)
-; SSE-NEXT: movdqa %xmm9, (%r9)
+; SSE-NEXT: movdqa %xmm6, (%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa %xmm10, 48(%rax)
-; SSE-NEXT: movdqa %xmm7, 16(%rax)
+; SSE-NEXT: movdqa %xmm12, 48(%rax)
+; SSE-NEXT: movdqa %xmm13, 16(%rax)
; SSE-NEXT: movdqa %xmm15, 32(%rax)
; SSE-NEXT: movdqa %xmm1, (%rax)
; SSE-NEXT: addq $456, %rsp # imm = 0x1C8
@@ -2219,7 +2199,7 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $584, %rsp # imm = 0x248
+; AVX1-NEXT: subq $536, %rsp # imm = 0x218
; AVX1-NEXT: vmovdqa 288(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
@@ -2235,154 +2215,154 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovdqa 224(%rdi), %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrlq $16, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vmovdqa 224(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsrlq $16, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa 240(%rdi), %xmm2
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vmovdqa 192(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vmovdqa 208(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
-; AVX1-NEXT: vmovdqa 368(%rdi), %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa 352(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT: vmovdqa 320(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrlq $16, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa 336(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm9
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX1-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7]
+; AVX1-NEXT: vmovdqa 368(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpslld $16, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa 352(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsrlq $16, %xmm0, %xmm4
+; AVX1-NEXT: vmovdqa 336(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,1,0,2,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX1-NEXT: vpslld $16, %xmm12, %xmm0
-; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa 64(%rdi), %xmm11
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm11
+; AVX1-NEXT: vpslld $16, %xmm11, %xmm1
; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm10
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm14
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm15
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm13
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm13
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm5
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm8
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrlq $16, %xmm13, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,0,2,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; AVX1-NEXT: vpsrlq $16, %xmm8, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,1,0,2,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4,5],xmm7[6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm15[3,4,5],ymm5[6,7]
-; AVX1-NEXT: vmovdqa 176(%rdi), %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm5
-; AVX1-NEXT: vmovdqa 160(%rdi), %xmm10
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; AVX1-NEXT: vmovdqa 128(%rdi), %xmm6
-; AVX1-NEXT: vpsrlq $16, %xmm6, %xmm7
-; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm12
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4,5],xmm9[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5],ymm6[6,7]
+; AVX1-NEXT: vmovdqa 176(%rdi), %xmm14
+; AVX1-NEXT: vpslld $16, %xmm14, %xmm6
+; AVX1-NEXT: vmovdqa 160(%rdi), %xmm7
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm9 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX1-NEXT: vmovdqa 128(%rdi), %xmm3
+; AVX1-NEXT: vpsrlq $16, %xmm3, %xmm9
; AVX1-NEXT: vmovdqa 144(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,0,2,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5],xmm5[6,7]
-; AVX1-NEXT: vandps %ymm4, %ymm9, %ymm4
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-NEXT: vandnps %ymm5, %ymm9, %ymm5
-; AVX1-NEXT: vmovaps %ymm9, %ymm7
-; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4
-; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; AVX1-NEXT: vpsrld $16, %xmm14, %xmm4
-; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,2,3,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm13, %xmm11
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5],xmm6[6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm5, %ymm9, %ymm5
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX1-NEXT: vandnps %ymm6, %ymm9, %ymm6
+; AVX1-NEXT: vorps %ymm6, %ymm5, %ymm5
+; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
; AVX1-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
-; AVX1-NEXT: vpsrld $16, %xmm8, %xmm4
-; AVX1-NEXT: vmovdqa %xmm8, %xmm9
+; AVX1-NEXT: vpsrld $16, %xmm13, %xmm5
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,1,1]
+; AVX1-NEXT: vmovdqa %xmm8, %xmm10
; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5],ymm1[6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
+; AVX1-NEXT: vmovdqa %xmm12, %xmm9
+; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsrld $16, %xmm12, %xmm5
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4,5],xmm4[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
+; AVX1-NEXT: vmovdqa %xmm3, %xmm15
+; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,2,3,3]
-; AVX1-NEXT: vmovdqa %xmm10, %xmm13
-; AVX1-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3]
+; AVX1-NEXT: vmovdqa %xmm14, %xmm8
+; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm0, %ymm7, %ymm0
-; AVX1-NEXT: vmovaps %ymm7, %ymm4
+; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps %ymm2, %ymm3
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpsrld $16, %xmm7, %xmm1
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm1 = mem[2,2,3,3]
-; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpsrld $16, %xmm5, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-NEXT: vpsrld $16, %xmm14, %xmm4
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
; AVX1-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -2390,785 +2370,770 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = mem[1,1,1,1]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3]
+; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vandnps %ymm1, %ymm4, %ymm1
+; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,1,1]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15]
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vpblendw $12, (%rsp), %xmm9, %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7]
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
-; AVX1-NEXT: vpblendw $12, (%rsp), %xmm14, %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm2 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm11 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm13 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandnps %ymm1, %ymm12, %ymm1
-; AVX1-NEXT: vandps %ymm2, %ymm12, %ymm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = mem[1,1,1,1]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm10[4,5],xmm13[6,7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm4
+; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm4 = mem[0,1,2,3],xmm8[4,5],mem[6,7]
+; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-NEXT: vandnps %ymm2, %ymm3, %ymm2
-; AVX1-NEXT: vmovaps %ymm3, %ymm4
+; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2
; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = mem[1,1,1,1]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm14 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm2
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm4 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm2 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5],mem[6,7]
-; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm13
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0
-; AVX1-NEXT: vandnps %ymm15, %ymm12, %ymm13
-; AVX1-NEXT: vandps %ymm0, %ymm12, %ymm0
-; AVX1-NEXT: vorps %ymm0, %ymm13, %ymm13
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm6[0,1,2,3],xmm8[4,5],xmm6[6,7]
-; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
-; AVX1-NEXT: vandps %ymm4, %ymm13, %ymm7
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm2 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm14
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
+; AVX1-NEXT: vandnps %ymm15, %ymm3, %ymm14
+; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm0, %ymm14, %ymm14
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm15 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
+; AVX1-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4],xmm5[5,6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm6, %ymm14, %ymm14
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-NEXT: vandnps %ymm5, %ymm4, %ymm5
-; AVX1-NEXT: vmovaps %ymm4, %ymm13
-; AVX1-NEXT: vorps %ymm5, %ymm7, %ymm0
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5
+; AVX1-NEXT: vorps %ymm5, %ymm14, %ymm5
+; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm5
+; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm14 = mem[2,2,3,3]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm14[0],xmm5[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm14, %xmm6, %xmm9
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3,4],xmm9[5,6,7]
+; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
+; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm13
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7
+; AVX1-NEXT: vandnps %ymm5, %ymm3, %ymm5
+; AVX1-NEXT: vandps %ymm3, %ymm7, %ymm7
+; AVX1-NEXT: vorps %ymm5, %ymm7, %ymm5
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm5
-; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm7 = mem[2,2,3,3]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm8
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6
-; AVX1-NEXT: vandnps %ymm5, %ymm12, %ymm5
-; AVX1-NEXT: vandps %ymm6, %ymm12, %ymm6
-; AVX1-NEXT: vorps %ymm5, %ymm6, %ymm5
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm8, %xmm6
-; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm4 = mem[2,2,3,3]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3,4],xmm6[5,6,7]
-; AVX1-NEXT: vmovaps %ymm13, %ymm6
-; AVX1-NEXT: vandps %ymm5, %ymm13, %ymm5
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-NEXT: vandnps %ymm4, %ymm13, %ymm4
-; AVX1-NEXT: vorps %ymm4, %ymm5, %ymm4
-; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm7
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,2,3,3]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm13[0],xmm7[0]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm7, %ymm5, %ymm5
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX1-NEXT: vandnps %ymm6, %ymm7, %ymm6
+; AVX1-NEXT: vorps %ymm6, %ymm5, %ymm5
+; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm4
-; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm5 = mem[2,2,3,3]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX1-NEXT: vpshufb %xmm7, %xmm14, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm5
+; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm6 = mem[2,2,3,3]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
+; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4],xmm4[5,6,7]
+; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vandnps %ymm4, %ymm12, %ymm2
-; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm1
+; AVX1-NEXT: vandnps %ymm4, %ymm3, %ymm2
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3]
+; AVX1-NEXT: vmovdqa %xmm10, %xmm14
+; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vpshufb %xmm10, %xmm15, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
-; AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm0, %ymm6, %ymm0
+; AVX1-NEXT: vandnps %ymm0, %ymm7, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm0 = mem[1,1,1,1]
-; AVX1-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm1 = mem[2,3,2,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = mem[1,1,1,1]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3]
+; AVX1-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm2 = mem[2,3,2,3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm2 = mem[0,1,0,3]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm11[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm3 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm14 = mem[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,4,6]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1]
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm0 = mem[1,1,1,1]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm12 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm5, %xmm12, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,3,2,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
-; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm1 = mem[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm13[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7]
-; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm3
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm6 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm2 = mem[1,1,1,1]
+; AVX1-NEXT: vpshufd $238, (%rsp), %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm4 = mem[2,3,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm7 = mem[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,4,6]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm15[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm4, %ymm4
-; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm6
+; AVX1-NEXT: vorps %ymm1, %ymm4, %ymm1
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm10 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm5, %xmm10, %xmm7
-; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm4 = mem[0,1,0,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,4,6]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm5[5,6,7]
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm4 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,4,6]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm9
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm5
-; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5
+; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm8 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm8, %xmm10
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm10 = mem[1,1,1,1]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm10 = mem[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,4,6]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm13[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3,4,5,6,7]
+; AVX1-NEXT: vandnps %ymm9, %ymm2, %ymm9
+; AVX1-NEXT: vandps %ymm2, %ymm11, %ymm11
+; AVX1-NEXT: vorps %ymm9, %ymm11, %ymm11
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX1-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm9 = mem[0,1],xmm9[2,3],mem[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3
+; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm13 = mem[0,1,0,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,4,6]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm14[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4],xmm3[5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm11
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
+; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vpsrlq $48, %xmm12, %xmm12
+; AVX1-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-NEXT: vpsrld $16, %xmm14, %xmm14
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm14[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3,4,5,6,7]
+; AVX1-NEXT: vandnps %ymm6, %ymm2, %ymm6
+; AVX1-NEXT: vandps %ymm2, %ymm7, %ymm7
+; AVX1-NEXT: vorps %ymm6, %ymm7, %ymm6
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-NEXT: vpsrld $16, %xmm7, %xmm7
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm7[1]
+; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7]
+; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm5
; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm6
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm7
-; AVX1-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,7]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm8[1],xmm7[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7]
; AVX1-NEXT: vandnps %ymm5, %ymm2, %ymm5
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm5, %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm6, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7]
+; AVX1-NEXT: vandps %ymm2, %ymm6, %ymm2
+; AVX1-NEXT: vorps %ymm5, %ymm2, %ymm2
+; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm5
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
-; AVX1-NEXT: vpsrlq $48, %xmm9, %xmm6
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpsrld $16, %xmm7, %xmm7
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm3, %xmm2
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%rsi)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, 32(%rsi)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%rdx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%rcx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%r8)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, 32(%r9)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%r9)
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm6
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,5,5,7]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rsi)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rdx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rcx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%r8)
+; AVX1-NEXT: vmovaps %ymm3, 32(%r9)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%r9)
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX1-NEXT: vmovaps %ymm5, (%rax)
-; AVX1-NEXT: addq $584, %rsp # imm = 0x248
+; AVX1-NEXT: vmovaps %ymm2, 32(%rax)
+; AVX1-NEXT: vmovaps %ymm4, (%rax)
+; AVX1-NEXT: addq $536, %rsp # imm = 0x218
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: subq $520, %rsp # imm = 0x208
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9
+; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[0,2,2,1,4,6,6,5]
-; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
+; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,1,4,6,6,5]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
-; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,3,2,3,4,7,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5,6],ymm4[7],ymm2[8,9],ymm4[10],ymm2[11,12,13,14],ymm4[15]
; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm1[0,1]
-; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm0[0,1],ymm1[0,1]
+; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15>
; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,2,2,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,1,2,1,6,5,6,5]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[2,1,2,1,6,5,6,5]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0],ymm11[1,2,3,4],ymm0[5],ymm11[6,7],ymm0[8],ymm11[9,10,11,12],ymm0[13],ymm11[14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm11, %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0
-; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9
-; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm11, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm7
+; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm11
+; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm14
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,2,2,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6,7]
+; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX2-SLOW-NEXT: vpshufd $102, (%rsp), %ymm0 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm0 = mem[2,1,2,1,6,5,6,5]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm7[1,2,3,4],ymm15[5],ymm7[6,7],ymm15[8],ymm7[9,10,11,12],ymm15[13],ymm7[14,15]
-; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,1,2,1,6,5,6,5]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm0[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7],ymm8[8],ymm7[9,10,11,12],ymm8[13],ymm7[14,15]
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm15, %ymm7, %ymm6
; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,3,3,4,5,7,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3,4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10,11,12],ymm6[13],ymm7[14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
-; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm13[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3,4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10,11,12],ymm7[13],ymm8[14,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
+; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm6, %ymm1
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1
; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10,11,12],ymm0[13],ymm1[14,15]
-; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm1
+; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6],ymm5[7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,2,0,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
; AVX2-SLOW-NEXT: vpblendd $56, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3,4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12
+; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3
; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm4
-; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3],xmm7[4,5],xmm1[6],xmm7[7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3
-; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm6
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,2,2,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,2,2]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2],xmm1[3],xmm6[4,5],xmm1[6],xmm6[7]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,2,2,1,4,6,6,5]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
-; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm13 = mem[0,3,2,3,4,7,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm8
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,2,2,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,2]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm13[3],xmm8[4,5],xmm13[6],xmm8[7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[0,2,2,1,4,6,6,5]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,3,2,3,4,7,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0,1],ymm6[2],ymm11[3,4,5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11,12,13,14],ymm6[15]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm11 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm10
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,2,0,3]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1],xmm1[2],xmm10[3],xmm1[4,5],xmm10[6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm6
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4,5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11,12,13,14],ymm13[15]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm13 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5,6],mem[7]
+; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm10
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm3
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[0,2,0,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3],xmm14[4,5],xmm10[6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5],ymm10[6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,1,6,5,6,5]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm8 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm1[2],ymm8[3,4,5,6],ymm1[7],ymm8[8,9],ymm1[10],ymm8[11,12,13,14],ymm1[15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
-; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm6
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm10[2,1,2,1,6,5,6,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm11 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5,6],ymm8[7],ymm11[8,9],ymm8[10],ymm11[11,12,13,14],ymm8[15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm6
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
+; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3],xmm6[4,5],xmm2[6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5],ymm2[6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
+; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,5,5,5,5]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
-; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,2,1,6,5,6,5]
-; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm1
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11,12,13,14],ymm3[15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm2
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
+; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm2
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[2,1,2,1,6,5,6,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshufd $197, (%rsp), %ymm0 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,0,3,5,5,4,7]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[1,1,0,3,5,5,4,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm10[0,3,2,3,4,7,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3,4],ymm0[5,6],ymm4[7],ymm0[8],ymm4[9,10,11,12],ymm0[13,14],ymm4[15]
-; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm4 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,2,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,0,0,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[0,3,2,3,4,7,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9,10,11,12],ymm0[13,14],ymm3[15]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,2,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,0,0,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,6,5,6,4]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,6,4]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5,6],xmm7[7]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm7 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm7 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[2,1,2,3]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[2,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1,2],xmm13[3],xmm8[4,5,6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3,4],xmm0[5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
+; AVX2-SLOW-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,1,2,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,0,0,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,6,4]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm0[4],xmm5[5,6],xmm0[7]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,1,0,3,5,5,4,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[0,3,2,3,4,7,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,6,5,6,4]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5,6],xmm14[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[1,1,0,3,5,5,4,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15]
+; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm1 = mem[0,3,2,3,4,7,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2,3,4],ymm5[5,6],ymm10[7],ymm5[8],ymm10[9,10,11,12],ymm5[13,14],ymm10[15]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,1,2,3]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,1,2,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm4
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u>
-; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm4
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm15[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3,4],ymm4[5,6],ymm9[7],ymm4[8],ymm9[9,10,11,12],ymm4[13,14],ymm9[15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,5,6,5]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[1,1,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5,6],xmm5[7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1,2,3,4],ymm14[5,6],ymm10[7],ymm14[8],ymm10[9,10,11,12],ymm14[13,14],ymm10[15]
+; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,2,3]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,3,2,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1,2],xmm15[3],xmm8[4,5,6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7],ymm10[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u>
+; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm10
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1,2,3,4],ymm10[5,6],ymm4[7],ymm10[8],ymm4[9,10,11,12],ymm10[13,14],ymm4[15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5,6],xmm6[7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2],xmm5[3],xmm7[4,5,6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,5,6,5]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
-; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm3
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3,4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9,10,11,12],ymm3[13,14],ymm4[15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm4
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4],ymm4[5,6],ymm1[7],ymm4[8],ymm1[9,10,11,12],ymm4[13,14],ymm1[15]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,1,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2],xmm0[3],xmm4[4,5,6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm2 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd $146, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
-; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm7
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
-; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm2
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm3
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%r8)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9)
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax)
; AVX2-SLOW-NEXT: addq $520, %rsp # imm = 0x208
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf32:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: subq $568, %rsp # imm = 0x238
+; AVX2-FAST-NEXT: subq $552, %rsp # imm = 0x228
; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm0
; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm8
+; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3
-; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3]
-; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm3[0,1]
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[2,3],ymm3[2,3]
+; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[0,1],ymm3[0,1]
; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm1[2,3],ymm0[2,3]
-; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm1[0,1],ymm0[0,1]
+; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm1[0,1],ymm0[0,1]
+; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,4,5,4,5,4,5,8,9,u,u,8,9,12,13,u,u,20,21,20,21,20,21,24,25,u,u,24,25,28,29>
; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,4,5,4,5,4,5,8,9,u,u,8,9,12,13,u,u,20,21,20,21,20,21,24,25,u,u,24,25,28,29>
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,0,3,4,5,4,7]
+; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,0,3,4,5,4,7]
-; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[2,1,2,1,6,5,6,5]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,1,2,1,6,5,6,5]
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7],ymm2[8],ymm0[9,10,11,12],ymm2[13],ymm0[14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7],ymm2[8],ymm0[9,10,11,12],ymm2[13],ymm0[14,15]
; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15>
-; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm1
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,2,2,2,4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,0,3,4,5,4,7]
-; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,1,2,1,6,5,6,5]
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3,4],ymm4[5],ymm1[6,7],ymm4[8],ymm1[9,10,11,12],ymm4[13],ymm1[14,15]
-; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1
-; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15>
+; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,2,2,2,4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm0
+; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm0
+; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,0,3,4,5,4,7]
+; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm4
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,2,1,6,5,6,5]
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm13[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1,2,3,4],ymm10[5],ymm4[6,7],ymm10[8],ymm4[9,10,11,12],ymm10[13],ymm4[14,15]
+; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6
; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm6
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm1
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6,7]
-; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm4, %ymm4
+; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm10
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7]
+; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm15
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[2,2,2,2,4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0],xmm6[1],xmm15[2,3],xmm6[4],xmm15[5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4
; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,3,u,u,u,u,u,u,u,u,14,15,u,u,u,u,18,19,u,u,u,u,u,u,u,u,30,31,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,3,u,u,u,u,u,u,u,u,14,15,u,u,u,u,18,19,u,u,u,u,u,u,u,u,30,31,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,6,7,6,7,6,7,8,9,u,u,10,11,14,15,u,u,22,23,22,23,22,23,24,25,u,u,26,27,30,31>
-; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm7
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2,3,4],ymm3[5],ymm7[6,7],ymm3[8],ymm7[9,10,11,12],ymm3[13],ymm7[14,15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
-; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8
-; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10,11,12],ymm0[13],ymm2[14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,12,13,u,u,u,u,16,17,u,u,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,2,1,6,5,6,5]
+; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5],ymm1[6,7],ymm3[8],ymm1[9,10,11,12],ymm3[13],ymm1[14,15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm8
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
+; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm1
+; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10,11,12],ymm1[13],ymm2[14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm2
+; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm3
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,12,13,u,u,u,u,16,17,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,1,6,5,6,5]
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,8,9,u,u,16,17,20,21,u,u,22,23,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm0 = mem[0,3,2,3,4,7,6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm6
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0,1],ymm1[2],ymm6[3,4,5,6],ymm1[7],ymm6[8,9],ymm1[10],ymm6[11,12,13,14],ymm1[15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6],ymm11[7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,1,0,3]
-; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
-; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm5
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5],ymm1[6,7]
-; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,3,2,3,4,7,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2],xmm12[3],xmm15[4,5],xmm12[6],xmm15[7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,1,2,1,6,5,6,5]
-; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[0,3,2,3,4,7,6,7]
-; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm12
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4,5,6],ymm1[7],ymm12[8,9],ymm1[10],ymm12[11,12,13,14],ymm1[15]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm11
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,0,3]
-; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm8
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3],xmm8[4,5],xmm9[6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm1[3,4,5],ymm8[6,7]
-; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm1
-; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm10
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,8,9,u,u,16,17,20,21,u,u,22,23,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4,5,6],ymm1[7],ymm6[8,9],ymm1[10],ymm6[11,12,13,14],ymm1[15]
+; AVX2-FAST-NEXT: vpblendd $109, (%rsp), %ymm11, %ymm8 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5,6],ymm11[7]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[2,1,0,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
+; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm12
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm13
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3],xmm13[4,5],xmm12[6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5],ymm12[6,7]
+; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm12
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[2,1,2,1,6,5,6,5]
+; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm1
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,3,2,3,4,7,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,8,9,u,u,16,17,20,21,u,u,22,23,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm1[2],ymm12[3,4,5,6],ymm1[7],ymm12[8,9],ymm1[10],ymm12[11,12,13,14],ymm1[15]
+; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm12 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6],mem[7]
+; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm10
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[2,1,0,3]
+; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm10
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1],xmm10[2],xmm15[3],xmm10[4,5],xmm15[6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3,4,5],ymm10[6,7]
+; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm6
+; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
+; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10
+; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm15
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u]
+; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm6
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3],xmm14[4,5],xmm2[6],xmm14[7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,u,u,18,19,22,23,u,u,22,23,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm14
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0,1],ymm4[2],ymm14[3,4,5,6],ymm4[7],ymm14[8,9],ymm4[10],ymm14[11,12,13,14],ymm4[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
-; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3,4,5],ymm2[6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3],xmm13[4,5],xmm6[6],xmm13[7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,u,u,18,19,22,23,u,u,22,23,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4,5,6],ymm11[7],ymm2[8,9],ymm11[10],ymm2[11,12,13,14],ymm11[15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
+; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4,5],xmm12[6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
@@ -3177,169 +3142,167 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm0
+; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm0
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,5,5,5]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
-; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm3
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
+; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm2
+; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm4
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm3
-; AVX2-FAST-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm4 = mem[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11,12,13,14],ymm4[15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5],ymm2[6,7]
+; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4
+; AVX2-FAST-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm5 = mem[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11,12,13,14],ymm5[15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[0,3,2,3,4,7,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,u,u,u,u,u,u,u,u,0,1,12,13,u,u,20,21,u,u,u,u,u,u,u,u,16,17,28,29,u,u>
-; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,0,1,4,5,6,7,8,9,u,u,u,u,8,9,u,u,16,17,20,21,22,23,24,25,u,u,u,u,24,25>
-; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm6
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3,4],ymm4[5,6],ymm6[7],ymm4[8],ymm6[9,10,11,12],ymm4[13,14],ymm6[15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
+; AVX2-FAST-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm4 = mem[0,3,2,3,4,7,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <4,5,u,u,u,u,u,u,u,u,0,1,12,13,u,u,20,21,u,u,u,u,u,u,u,u,16,17,28,29,u,u>
+; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm5
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,0,1,4,5,6,7,8,9,u,u,u,u,8,9,u,u,16,17,20,21,22,23,24,25,u,u,u,u,24,25>
+; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm6
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4],ymm5[5,6],ymm6[7],ymm5[8],ymm6[9,10,11,12],ymm5[13,14],ymm6[15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,1,0,3]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,1,2,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm6
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,6,4]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4],xmm6[5,6],xmm0[7]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm5
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13>
-; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm2
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,1,2,0,4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,2,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm6
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,6,5,6,4]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm12[4],xmm6[5,6],xmm12[7]
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm12 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13>
+; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm0
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm3
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm0
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm10 = mem[0,3,2,3,4,7,6,7]
-; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm3
+; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9,10,11,12],ymm0[13,14],ymm3[15]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm3 = ymm11[0,1],mem[2],ymm11[3,4],mem[5],ymm11[6,7]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
-; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm2
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,1,0,3]
+; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm7
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,5,6,4]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,6,4]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5,6],xmm11[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm2
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u>
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm2, %ymm11 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm2
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[2,1,2,0,4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm14[1,2],xmm6[3],xmm14[4,5,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3,4],xmm0[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u>
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,2,3,6,7,6,7,10,11,u,u,u,u,10,11,u,u,18,19,22,23,22,23,26,27,u,u,u,u,26,27>
-; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3,4],ymm2[5,6],ymm15[7],ymm2[8],ymm15[9,10,11,12],ymm2[13,14],ymm15[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,5,6,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15>
-; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1
-; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9,10,11,12],ymm1[13,14],ymm2[15]
-; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm2
+; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6],ymm4[7],ymm7[8],ymm4[9,10,11,12],ymm7[13,14],ymm4[15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15>
+; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm13
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,1,4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2],xmm12[3],xmm13[4,5,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3,4],xmm4[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6
+; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4],ymm6[5,6],ymm1[7],ymm6[8],ymm1[9,10,11,12],ymm6[13,14],ymm1[15]
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
-; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm3
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5,6],ymm11[7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,1,4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3],xmm2[4,5,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5,6],mem[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
-; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm7
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3],xmm6[4],xmm8[5],xmm6[6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm5
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1]
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0
-; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
-; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm13, 32(%r8)
-; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r8)
-; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r9)
-; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9)
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm3
+; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm7
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rsi)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rdx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rcx)
+; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r8)
+; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8)
+; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm6, (%r9)
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax)
+; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%rax)
; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax)
-; AVX2-FAST-NEXT: addq $568, %rsp # imm = 0x238
+; AVX2-FAST-NEXT: addq $552, %rsp # imm = 0x228
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
index 35ae596390d72..2ab6fcca1a4b2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
@@ -156,34 +156,34 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i32_stride2_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm6
-; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps 32(%rdi), %xmm4
-; SSE-NEXT: movaps 48(%rdi), %xmm9
-; SSE-NEXT: movaps 80(%rdi), %xmm10
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movaps 16(%rdi), %xmm1
+; SSE-NEXT: movaps 32(%rdi), %xmm2
+; SSE-NEXT: movaps 48(%rdi), %xmm3
+; SSE-NEXT: movaps 80(%rdi), %xmm4
; SSE-NEXT: movaps 64(%rdi), %xmm5
-; SSE-NEXT: movaps 112(%rdi), %xmm11
+; SSE-NEXT: movaps 112(%rdi), %xmm6
; SSE-NEXT: movaps 96(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2]
-; SSE-NEXT: movaps %xmm5, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
-; SSE-NEXT: movaps %xmm4, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm9[0,2]
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm11[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm10[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm9[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm8[1,3]
-; SSE-NEXT: movaps %xmm3, 32(%rsi)
-; SSE-NEXT: movaps %xmm0, (%rsi)
-; SSE-NEXT: movaps %xmm1, 48(%rsi)
-; SSE-NEXT: movaps %xmm2, 16(%rsi)
+; SSE-NEXT: movaps %xmm7, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2]
+; SSE-NEXT: movaps %xmm5, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm4[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm0, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm6[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT: movaps %xmm9, 32(%rsi)
+; SSE-NEXT: movaps %xmm11, (%rsi)
+; SSE-NEXT: movaps %xmm8, 48(%rsi)
+; SSE-NEXT: movaps %xmm10, 16(%rsi)
; SSE-NEXT: movaps %xmm5, 32(%rdx)
-; SSE-NEXT: movaps %xmm6, (%rdx)
+; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm7, 48(%rdx)
-; SSE-NEXT: movaps %xmm4, 16(%rdx)
+; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride2_vf16:
@@ -252,8 +252,8 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i32_stride2_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm9
-; SSE-NEXT: movaps 32(%rdi), %xmm14
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movaps 32(%rdi), %xmm1
; SSE-NEXT: movaps 48(%rdi), %xmm8
; SSE-NEXT: movaps 208(%rdi), %xmm10
; SSE-NEXT: movaps 192(%rdi), %xmm2
@@ -263,18 +263,18 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-NEXT: movaps 64(%rdi), %xmm6
; SSE-NEXT: movaps 240(%rdi), %xmm13
; SSE-NEXT: movaps 224(%rdi), %xmm4
-; SSE-NEXT: movaps 176(%rdi), %xmm15
+; SSE-NEXT: movaps 176(%rdi), %xmm14
; SSE-NEXT: movaps 160(%rdi), %xmm5
-; SSE-NEXT: movaps 112(%rdi), %xmm1
+; SSE-NEXT: movaps 112(%rdi), %xmm15
; SSE-NEXT: movaps 96(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm1[1,3]
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm15[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm15[1,3]
-; SSE-NEXT: movaps %xmm4, %xmm15
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm13[0,2]
+; SSE-NEXT: movaps %xmm7, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm15[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm15[1,3]
+; SSE-NEXT: movaps %xmm5, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm14[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm14[1,3]
+; SSE-NEXT: movaps %xmm4, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm13[1,3]
; SSE-NEXT: movaps %xmm2, %xmm13
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm10[0,2]
@@ -285,23 +285,23 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-NEXT: movaps %xmm6, %xmm11
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3]
-; SSE-NEXT: movaps %xmm14, %xmm12
+; SSE-NEXT: movaps %xmm1, %xmm12
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm8[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,3],xmm8[1,3]
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm9, %xmm14
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm8[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm8[1,3]
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3]
; SSE-NEXT: movaps %xmm13, 96(%rsi)
; SSE-NEXT: movaps %xmm10, 64(%rsi)
; SSE-NEXT: movaps %xmm11, 32(%rsi)
-; SSE-NEXT: movaps %xmm14, (%rsi)
-; SSE-NEXT: movaps %xmm15, 112(%rsi)
-; SSE-NEXT: movaps %xmm1, 80(%rsi)
-; SSE-NEXT: movaps %xmm0, 48(%rsi)
+; SSE-NEXT: movaps %xmm1, (%rsi)
+; SSE-NEXT: movaps %xmm14, 112(%rsi)
+; SSE-NEXT: movaps %xmm15, 80(%rsi)
+; SSE-NEXT: movaps %xmm9, 48(%rsi)
; SSE-NEXT: movaps %xmm12, 16(%rsi)
-; SSE-NEXT: movaps %xmm9, (%rdx)
+; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm6, 32(%rdx)
; SSE-NEXT: movaps %xmm3, 64(%rdx)
; SSE-NEXT: movaps %xmm2, 96(%rdx)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 1a4c92467ea44..8a5bedc8bb110 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -170,40 +170,40 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movaps 80(%rdi), %xmm8
-; SSE-NEXT: movaps 64(%rdi), %xmm3
+; SSE-NEXT: movaps 80(%rdi), %xmm0
+; SSE-NEXT: movaps 64(%rdi), %xmm4
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movaps 16(%rdi), %xmm6
-; SSE-NEXT: movaps 32(%rdi), %xmm10
+; SSE-NEXT: movaps 32(%rdi), %xmm3
; SSE-NEXT: movdqa 48(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm11
+; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: movaps %xmm6, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[1,0]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,2]
; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[1,0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm3[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm10[2,3]
+; SSE-NEXT: movaps %xmm4, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[1,0]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm3[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,3]
; SSE-NEXT: movaps %xmm6, 16(%rsi)
-; SSE-NEXT: movaps %xmm11, (%rsi)
+; SSE-NEXT: movaps %xmm5, (%rsi)
; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: movaps %xmm1, (%rdx)
-; SSE-NEXT: movaps %xmm5, 16(%rcx)
-; SSE-NEXT: movaps %xmm0, (%rcx)
+; SSE-NEXT: movaps %xmm11, 16(%rcx)
+; SSE-NEXT: movaps %xmm8, (%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride3_vf8:
@@ -358,94 +358,93 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i32_stride3_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps 96(%rdi), %xmm14
-; SSE-NEXT: movaps 128(%rdi), %xmm11
-; SSE-NEXT: movaps 112(%rdi), %xmm12
-; SSE-NEXT: movaps 144(%rdi), %xmm3
-; SSE-NEXT: movaps 176(%rdi), %xmm13
-; SSE-NEXT: movaps 160(%rdi), %xmm5
-; SSE-NEXT: movaps (%rdi), %xmm15
-; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps 32(%rdi), %xmm6
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 48(%rdi), %xmm10
-; SSE-NEXT: movaps 80(%rdi), %xmm9
+; SSE-NEXT: movaps 96(%rdi), %xmm15
+; SSE-NEXT: movaps 128(%rdi), %xmm8
+; SSE-NEXT: movaps 112(%rdi), %xmm9
+; SSE-NEXT: movaps 144(%rdi), %xmm1
+; SSE-NEXT: movaps 176(%rdi), %xmm7
+; SSE-NEXT: movaps 160(%rdi), %xmm11
+; SSE-NEXT: movaps (%rdi), %xmm10
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: movaps 32(%rdi), %xmm12
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 48(%rdi), %xmm4
+; SSE-NEXT: movaps 80(%rdi), %xmm14
; SSE-NEXT: movaps 64(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0]
-; SSE-NEXT: movaps %xmm10, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,2]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[1,0]
-; SSE-NEXT: movaps %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0]
-; SSE-NEXT: movaps %xmm15, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0]
-; SSE-NEXT: movaps %xmm14, %xmm1
-; SSE-NEXT: movaps %xmm14, %xmm7
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0]
+; SSE-NEXT: movaps %xmm4, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm11, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[1,0]
+; SSE-NEXT: movaps %xmm1, %xmm6
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0]
; SSE-NEXT: movaps %xmm10, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[1,0]
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm15, %xmm13
+; SSE-NEXT: movaps %xmm15, %xmm5
+; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm4, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm2[0,0]
; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm9[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2]
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: movaps %xmm3, %xmm14
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[0,0]
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2]
-; SSE-NEXT: movaps %xmm7, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0]
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm11[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2]
+; SSE-NEXT: movaps %xmm1, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm11[0,0]
+; SSE-NEXT: movaps %xmm11, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2]
+; SSE-NEXT: movaps %xmm5, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0]
+; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm8[0,0]
-; SSE-NEXT: movaps %xmm8, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm7[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm3[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm13[0,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm3[0,0]
+; SSE-NEXT: movaps %xmm3, %xmm8
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm5[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1]
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = mem[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[0,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 32(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, (%rsi)
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = mem[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3]
+; SSE-NEXT: movaps %xmm13, 32(%rsi)
+; SSE-NEXT: movaps %xmm6, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: movaps %xmm3, 48(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: movaps %xmm3, 16(%rsi)
; SSE-NEXT: movaps %xmm1, 32(%rdx)
-; SSE-NEXT: movaps %xmm15, (%rdx)
-; SSE-NEXT: movaps %xmm4, 48(%rdx)
-; SSE-NEXT: movaps %xmm6, 16(%rdx)
-; SSE-NEXT: movaps %xmm7, 32(%rcx)
+; SSE-NEXT: movaps %xmm10, (%rdx)
+; SSE-NEXT: movaps %xmm15, 48(%rdx)
+; SSE-NEXT: movaps %xmm12, 16(%rdx)
+; SSE-NEXT: movaps %xmm4, 32(%rcx)
; SSE-NEXT: movaps %xmm0, (%rcx)
-; SSE-NEXT: movaps %xmm5, 48(%rcx)
+; SSE-NEXT: movaps %xmm8, 48(%rcx)
; SSE-NEXT: movaps %xmm2, 16(%rcx)
; SSE-NEXT: retq
;
@@ -691,62 +690,61 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-LABEL: load_i32_stride3_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $344, %rsp # imm = 0x158
-; SSE-NEXT: movaps 336(%rdi), %xmm1
-; SSE-NEXT: movaps 368(%rdi), %xmm9
-; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 336(%rdi), %xmm10
+; SSE-NEXT: movaps 368(%rdi), %xmm1
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 352(%rdi), %xmm14
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 240(%rdi), %xmm15
-; SSE-NEXT: movaps 272(%rdi), %xmm13
-; SSE-NEXT: movaps 256(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 144(%rdi), %xmm3
-; SSE-NEXT: movaps 176(%rdi), %xmm12
-; SSE-NEXT: movaps 160(%rdi), %xmm10
-; SSE-NEXT: movaps 48(%rdi), %xmm5
-; SSE-NEXT: movaps 80(%rdi), %xmm6
-; SSE-NEXT: movaps 64(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0]
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm5, %xmm2
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 240(%rdi), %xmm6
+; SSE-NEXT: movaps 272(%rdi), %xmm5
+; SSE-NEXT: movaps 256(%rdi), %xmm11
+; SSE-NEXT: movaps 144(%rdi), %xmm15
+; SSE-NEXT: movaps 176(%rdi), %xmm8
+; SSE-NEXT: movaps 160(%rdi), %xmm13
+; SSE-NEXT: movaps 48(%rdi), %xmm3
+; SSE-NEXT: movaps 80(%rdi), %xmm4
+; SSE-NEXT: movaps 64(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm7, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[1,0]
-; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm13, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[1,0]
; SSE-NEXT: movaps %xmm15, %xmm2
-; SSE-NEXT: movaps %xmm15, %xmm4
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm11, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0]
+; SSE-NEXT: movaps %xmm5, %xmm12
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm6, %xmm2
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm14, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
+; SSE-NEXT: movaps %xmm10, %xmm1
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm10, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 16(%rdi), %xmm7
+; SSE-NEXT: movaps 16(%rdi), %xmm5
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm7, %xmm0
+; SSE-NEXT: movaps %xmm5, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0]
-; SSE-NEXT: movaps (%rdi), %xmm11
-; SSE-NEXT: movaps %xmm11, %xmm2
+; SSE-NEXT: movaps (%rdi), %xmm9
+; SSE-NEXT: movaps %xmm9, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%rdi), %xmm2
-; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
-; SSE-NEXT: movaps 112(%rdi), %xmm14
-; SSE-NEXT: movaps %xmm14, %xmm0
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 112(%rdi), %xmm10
+; SSE-NEXT: movaps %xmm10, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0]
; SSE-NEXT: movaps 96(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -754,151 +752,150 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 208(%rdi), %xmm9
-; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: movaps 208(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0]
; SSE-NEXT: movaps 192(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 320(%rdi), %xmm15
-; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 320(%rdi), %xmm14
; SSE-NEXT: movaps 304(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[1,0]
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0]
+; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 288(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm5, %xmm2
-; SSE-NEXT: movaps %xmm8, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0]
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm6[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm12[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm7[0,0]
+; SSE-NEXT: movaps %xmm7, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm4, %xmm15
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm0[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3]
+; SSE-NEXT: movaps %xmm13, %xmm0
+; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm13[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2]
+; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm6, %xmm13
+; SSE-NEXT: movaps %xmm11, %xmm0
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm11[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, %xmm12
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0]
; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm10[0,0]
+; SSE-NEXT: movaps %xmm10, %xmm0
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, %xmm10
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm14[0,0]
-; SSE-NEXT: movaps %xmm14, %xmm0
-; SSE-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
-; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0]
; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
-; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm7[0,0]
-; SSE-NEXT: movaps %xmm7, %xmm4
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm13[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm4[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = mem[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm14[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm5[0,0]
+; SSE-NEXT: movaps %xmm5, %xmm4
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm14[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm4[0,2]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1]
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = mem[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0,3]
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[1,1,1,1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: # xmm13 = mem[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
-; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,3]
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[1,1,1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3]
+; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; SSE-NEXT: # xmm5 = mem[1,1,1,1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = mem[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,1,1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; SSE-NEXT: # xmm9 = mem[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[0,3]
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT: # xmm8 = mem[1,1,1,1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3]
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; SSE-NEXT: # xmm14 = mem[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
+; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; SSE-NEXT: # xmm14 = xmm14[0,1],mem[0,3]
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; SSE-NEXT: # xmm8 = mem[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: # xmm10 = mem[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3]
+; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = mem[1,1,1,1]
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = mem[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 96(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 64(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 32(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, (%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 112(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 80(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 48(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 16(%rsi)
-; SSE-NEXT: movaps %xmm1, 96(%rdx)
-; SSE-NEXT: movaps %xmm3, 64(%rdx)
-; SSE-NEXT: movaps %xmm10, 32(%rdx)
-; SSE-NEXT: movaps %xmm11, (%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 96(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 64(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 32(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, (%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 112(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 80(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 48(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 16(%rsi)
+; SSE-NEXT: movaps %xmm3, 96(%rdx)
+; SSE-NEXT: movaps %xmm6, 64(%rdx)
+; SSE-NEXT: movaps %xmm11, 32(%rdx)
+; SSE-NEXT: movaps %xmm9, (%rdx)
; SSE-NEXT: movaps %xmm12, 112(%rdx)
-; SSE-NEXT: movaps %xmm15, 80(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, 48(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, 16(%rdx)
+; SSE-NEXT: movaps %xmm13, 80(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movaps %xmm3, 48(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movaps %xmm3, 16(%rdx)
; SSE-NEXT: movaps %xmm0, 96(%rcx)
-; SSE-NEXT: movaps %xmm7, 112(%rcx)
-; SSE-NEXT: movaps %xmm9, 64(%rcx)
-; SSE-NEXT: movaps %xmm4, 80(%rcx)
-; SSE-NEXT: movaps %xmm14, 32(%rcx)
-; SSE-NEXT: movaps %xmm13, 48(%rcx)
+; SSE-NEXT: movaps %xmm1, 112(%rcx)
+; SSE-NEXT: movaps %xmm10, 64(%rcx)
+; SSE-NEXT: movaps %xmm14, 80(%rcx)
+; SSE-NEXT: movaps %xmm5, 32(%rcx)
+; SSE-NEXT: movaps %xmm4, 48(%rcx)
; SSE-NEXT: movaps %xmm2, (%rcx)
-; SSE-NEXT: movaps %xmm5, 16(%rcx)
+; SSE-NEXT: movaps %xmm7, 16(%rcx)
; SSE-NEXT: addq $344, %rsp # imm = 0x158
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index cd49924d7c563..7f19e0ce50ff3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -196,97 +196,97 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: load_i32_stride4_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm5
-; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps 32(%rdi), %xmm6
-; SSE-NEXT: movaps 48(%rdi), %xmm9
-; SSE-NEXT: movaps 80(%rdi), %xmm10
-; SSE-NEXT: movaps 64(%rdi), %xmm4
-; SSE-NEXT: movaps 112(%rdi), %xmm11
-; SSE-NEXT: movaps 96(%rdi), %xmm3
-; SSE-NEXT: movaps %xmm3, %xmm7
-; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
-; SSE-NEXT: movaps %xmm4, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
-; SSE-NEXT: movaps %xmm1, %xmm12
-; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm7[0]
-; SSE-NEXT: movaps %xmm6, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE-NEXT: movaps %xmm0, %xmm13
-; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0]
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: movaps 32(%rdi), %xmm2
+; SSE-NEXT: movaps 48(%rdi), %xmm4
+; SSE-NEXT: movaps 80(%rdi), %xmm5
+; SSE-NEXT: movaps 64(%rdi), %xmm1
+; SSE-NEXT: movaps 112(%rdi), %xmm6
+; SSE-NEXT: movaps 96(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm7, %xmm8
+; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSE-NEXT: movaps %xmm1, %xmm9
+; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
+; SSE-NEXT: movaps %xmm9, %xmm10
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0]
+; SSE-NEXT: movaps %xmm2, %xmm11
+; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
+; SSE-NEXT: movaps %xmm0, %xmm12
+; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
+; SSE-NEXT: movaps %xmm12, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; SSE-NEXT: movaps %xmm4, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; SSE-NEXT: movaps %xmm5, %xmm7
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; SSE-NEXT: movaps %xmm12, 16(%rsi)
+; SSE-NEXT: movaps %xmm10, 16(%rsi)
; SSE-NEXT: movaps %xmm13, (%rsi)
-; SSE-NEXT: movaps %xmm1, 16(%rdx)
-; SSE-NEXT: movaps %xmm0, (%rdx)
-; SSE-NEXT: movaps %xmm2, 16(%rcx)
-; SSE-NEXT: movaps %xmm7, (%rcx)
-; SSE-NEXT: movaps %xmm4, 16(%r8)
-; SSE-NEXT: movaps %xmm5, (%r8)
+; SSE-NEXT: movaps %xmm9, 16(%rdx)
+; SSE-NEXT: movaps %xmm12, (%rdx)
+; SSE-NEXT: movaps %xmm5, 16(%rcx)
+; SSE-NEXT: movaps %xmm3, (%rcx)
+; SSE-NEXT: movaps %xmm1, 16(%r8)
+; SSE-NEXT: movaps %xmm0, (%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride4_vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps 64(%rdi), %ymm0
; AVX1-NEXT: vmovaps 96(%rdi), %ymm1
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX1-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5]
-; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4]
; AVX1-NEXT: vmovaps 32(%rdi), %xmm5
; AVX1-NEXT: vmovaps 48(%rdi), %xmm6
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm5[0]
-; AVX1-NEXT: vmovaps (%rdi), %xmm3
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm7
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5]
-; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,0],ymm8[2,3],ymm9[6,4],ymm8[6,7]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[1],xmm7[1],zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm10[1],ymm1[3],ymm10[3]
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm9 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm5[2],xmm6[2]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
+; AVX1-NEXT: vmovaps (%rdi), %xmm8
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm9
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,0],ymm7[2,3],ymm10[6,4],ymm7[6,7]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm11 = xmm8[1],xmm9[1],zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm11 = zero,zero,xmm5[2],xmm6[2]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX1-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7]
; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm3[3,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm9[3,0],xmm8[3,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vmovaps %ymm11, (%rsi)
-; AVX1-NEXT: vmovaps %ymm12, (%rdx)
-; AVX1-NEXT: vmovaps %ymm2, (%rcx)
+; AVX1-NEXT: vmovaps %ymm3, (%rsi)
+; AVX1-NEXT: vmovaps %ymm7, (%rdx)
+; AVX1-NEXT: vmovaps %ymm10, (%rcx)
; AVX1-NEXT: vmovaps %ymm0, (%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i32_stride4_vf8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps (%rdi), %ymm8
-; AVX2-NEXT: vmovaps 32(%rdi), %ymm9
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm4
; AVX2-NEXT: vmovaps 64(%rdi), %ymm1
; AVX2-NEXT: vmovaps 96(%rdi), %ymm2
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [17179869184,17179869184,17179869184,17179869184]
@@ -294,45 +294,45 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovaps {{.*#+}} xmm5 = <u,u,0,4>
-; AVX2-NEXT: vpermps %ymm9, %ymm5, %ymm6
+; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm6
; AVX2-NEXT: vmovaps (%rdi), %xmm7
-; AVX2-NEXT: vmovaps 16(%rdi), %xmm0
+; AVX2-NEXT: vmovaps 16(%rdi), %xmm8
; AVX2-NEXT: vmovaps 32(%rdi), %xmm5
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [21474836481,21474836481,21474836481,21474836481]
-; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm6
-; AVX2-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-NEXT: vmovaps 48(%rdi), %xmm6
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836481,21474836481,21474836481,21474836481]
+; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm9
+; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm6
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT: vmovaps 48(%rdi), %xmm9
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
; AVX2-NEXT: vmovaps {{.*#+}} xmm11 = <1,5,u,u>
-; AVX2-NEXT: vpermps %ymm8, %ymm11, %ymm11
-; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [25769803778,25769803778,25769803778,25769803778]
-; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm11
-; AVX2-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-NEXT: vpermps %ymm0, %ymm11, %ymm11
+; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm10 = [25769803778,25769803778,25769803778,25769803778]
+; AVX2-NEXT: vpermps %ymm2, %ymm10, %ymm11
+; AVX2-NEXT: vpermps %ymm1, %ymm10, %ymm10
+; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX2-NEXT: vmovaps {{.*#+}} xmm11 = <u,u,2,6>
-; AVX2-NEXT: vpermps %ymm9, %ymm11, %ymm9
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [30064771075,30064771075,30064771075,30064771075]
-; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpermps %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpermps %ymm4, %ymm11, %ymm4
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [30064771075,30064771075,30064771075,30064771075]
+; AVX2-NEXT: vpermps %ymm2, %ymm7, %ymm2
+; AVX2-NEXT: vpermps %ymm1, %ymm7, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-NEXT: vmovaps {{.*#+}} xmm4 = <3,7,u,u>
-; AVX2-NEXT: vpermps %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovaps %ymm10, (%rsi)
-; AVX2-NEXT: vmovaps %ymm3, (%rdx)
-; AVX2-NEXT: vmovaps %ymm0, (%rcx)
-; AVX2-NEXT: vmovaps %ymm1, (%r8)
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovaps %ymm3, (%rsi)
+; AVX2-NEXT: vmovaps %ymm6, (%rdx)
+; AVX2-NEXT: vmovaps %ymm4, (%rcx)
+; AVX2-NEXT: vmovaps %ymm0, (%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -372,223 +372,230 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: load_i32_stride4_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: subq $24, %rsp
-; SSE-NEXT: movaps 208(%rdi), %xmm10
+; SSE-NEXT: subq $40, %rsp
+; SSE-NEXT: movaps 208(%rdi), %xmm8
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 240(%rdi), %xmm10
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 240(%rdi), %xmm5
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 224(%rdi), %xmm2
-; SSE-NEXT: movaps 80(%rdi), %xmm3
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 64(%rdi), %xmm9
+; SSE-NEXT: movaps 80(%rdi), %xmm4
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 64(%rdi), %xmm5
; SSE-NEXT: movaps 112(%rdi), %xmm6
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 96(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 96(%rdi), %xmm3
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 128(%rdi), %xmm12
-; SSE-NEXT: movaps 176(%rdi), %xmm4
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 160(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: movaps %xmm12, %xmm11
-; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
-; SSE-NEXT: movaps %xmm11, %xmm1
+; SSE-NEXT: movaps 128(%rdi), %xmm9
+; SSE-NEXT: movaps 176(%rdi), %xmm11
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 160(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm7, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
+; SSE-NEXT: movaps %xmm9, %xmm15
+; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
+; SSE-NEXT: movaps %xmm15, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
-; SSE-NEXT: movaps %xmm7, %xmm1
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; SSE-NEXT: movaps %xmm9, %xmm7
-; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
-; SSE-NEXT: movaps %xmm7, %xmm0
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
+; SSE-NEXT: movaps %xmm6, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: movaps %xmm2, %xmm6
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE-NEXT: movaps 192(%rdi), %xmm5
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
-; SSE-NEXT: movaps %xmm1, %xmm15
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
+; SSE-NEXT: movaps 192(%rdi), %xmm12
+; SSE-NEXT: movaps %xmm12, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 32(%rdi), %xmm4
-; SSE-NEXT: movaps 48(%rdi), %xmm13
-; SSE-NEXT: movaps %xmm4, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1]
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps 16(%rdi), %xmm10
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
-; SSE-NEXT: movaps %xmm2, %xmm14
-; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
+; SSE-NEXT: movaps 32(%rdi), %xmm3
+; SSE-NEXT: movaps 48(%rdi), %xmm14
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
+; SSE-NEXT: movaps (%rdi), %xmm10
+; SSE-NEXT: movaps 16(%rdi), %xmm8
+; SSE-NEXT: movaps %xmm10, %xmm13
+; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1]
+; SSE-NEXT: movaps %xmm13, %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1]
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; SSE-NEXT: movaps %xmm9, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1]
+; SSE-NEXT: movaps %xmm5, %xmm7
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSE-NEXT: movaps %xmm12, %xmm8
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1]
+; SSE-NEXT: movaps %xmm10, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; SSE-NEXT: movaps %xmm12, %xmm10
-; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm8[1]
-; SSE-NEXT: movaps %xmm9, %xmm8
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1]
-; SSE-NEXT: movaps %xmm5, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; SSE-NEXT: movaps %xmm0, %xmm13
-; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm4[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE-NEXT: movaps %xmm15, 48(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movaps %xmm4, 16(%rsi)
-; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload
-; SSE-NEXT: movaps %xmm4, 32(%rsi)
-; SSE-NEXT: movaps %xmm14, (%rsi)
+; SSE-NEXT: movaps %xmm3, 48(%rsi)
+; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
+; SSE-NEXT: movaps %xmm3, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movaps %xmm3, 32(%rsi)
+; SSE-NEXT: movaps %xmm11, (%rsi)
; SSE-NEXT: movaps %xmm1, 48(%rdx)
-; SSE-NEXT: movaps %xmm7, 16(%rdx)
-; SSE-NEXT: movaps %xmm2, (%rdx)
-; SSE-NEXT: movaps %xmm11, 32(%rdx)
-; SSE-NEXT: movaps %xmm8, 16(%rcx)
-; SSE-NEXT: movaps %xmm3, 48(%rcx)
-; SSE-NEXT: movaps %xmm10, 32(%rcx)
-; SSE-NEXT: movaps %xmm13, (%rcx)
-; SSE-NEXT: movaps %xmm5, 48(%r8)
-; SSE-NEXT: movaps %xmm9, 16(%r8)
-; SSE-NEXT: movaps %xmm12, 32(%r8)
-; SSE-NEXT: movaps %xmm0, (%r8)
-; SSE-NEXT: addq $24, %rsp
+; SSE-NEXT: movaps %xmm6, 16(%rdx)
+; SSE-NEXT: movaps %xmm13, (%rdx)
+; SSE-NEXT: movaps %xmm15, 32(%rdx)
+; SSE-NEXT: movaps %xmm7, 16(%rcx)
+; SSE-NEXT: movaps %xmm8, 48(%rcx)
+; SSE-NEXT: movaps %xmm2, 32(%rcx)
+; SSE-NEXT: movaps %xmm0, (%rcx)
+; SSE-NEXT: movaps %xmm12, 48(%r8)
+; SSE-NEXT: movaps %xmm5, 16(%r8)
+; SSE-NEXT: movaps %xmm9, 32(%r8)
+; SSE-NEXT: movaps %xmm10, (%r8)
+; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride4_vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $312, %rsp # imm = 0x138
+; AVX1-NEXT: subq $264, %rsp # imm = 0x108
; AVX1-NEXT: vmovaps 64(%rdi), %ymm5
-; AVX1-NEXT: vmovaps 96(%rdi), %ymm8
+; AVX1-NEXT: vmovaps 96(%rdi), %ymm4
; AVX1-NEXT: vmovaps 192(%rdi), %ymm2
-; AVX1-NEXT: vmovaps 224(%rdi), %ymm15
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3,0,1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm11[0],ymm15[2],ymm11[2]
-; AVX1-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[4],ymm2[4],ymm7[5],ymm2[5]
-; AVX1-NEXT: vmovaps %ymm2, %ymm4
-; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
+; AVX1-NEXT: vmovaps 224(%rdi), %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX1-NEXT: vmovaps %ymm3, %ymm14
+; AVX1-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vmovaps %ymm1, %ymm15
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
+; AVX1-NEXT: vmovaps %ymm2, %ymm10
+; AVX1-NEXT: vmovaps %ymm1, %ymm3
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4]
; AVX1-NEXT: vmovaps 160(%rdi), %xmm1
; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 176(%rdi), %xmm0
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vmovaps 144(%rdi), %xmm0
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 128(%rdi), %xmm2
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,0,1]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1]
-; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5]
-; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm0
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vmovaps %xmm1, %xmm14
-; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps (%rdi), %xmm12
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm6
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
+; AVX1-NEXT: vmovaps 176(%rdi), %xmm6
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm1[0]
+; AVX1-NEXT: vmovaps %xmm6, %xmm2
+; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps 144(%rdi), %xmm1
+; AVX1-NEXT: vmovaps 128(%rdi), %xmm6
; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[4],ymm15[4],ymm11[5],ymm15[5]
-; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm7[1,0],ymm4[5,4],ymm7[5,4]
-; AVX1-NEXT: vmovaps %ymm7, %ymm11
-; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm4[1],zero,zero
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm13 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[2,3,0,1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+; AVX1-NEXT: vmovaps %ymm6, %ymm8
+; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1]
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm13 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[4],ymm5[4],ymm9[5],ymm5[5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm0[2,0],ymm13[4,5],ymm0[6,4]
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm11
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm12
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0]
+; AVX1-NEXT: vmovaps (%rdi), %xmm4
+; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm5
+; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[1],ymm8[1],ymm3[4],ymm8[4],ymm3[5],ymm8[5]
-; AVX1-NEXT: vmovaps %ymm5, %ymm2
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm1[1,0],ymm5[5,4],ymm1[5,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
+; AVX1-NEXT: vmovaps %ymm15, %ymm4
+; AVX1-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[4],ymm14[4],ymm15[5],ymm14[5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,0],ymm3[1,0],ymm10[5,4],ymm3[5,4]
+; AVX1-NEXT: vmovaps %ymm10, %ymm15
+; AVX1-NEXT: vmovaps %ymm3, %ymm13
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm7[2,3],ymm6[6,4],ymm7[6,7]
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm12[1],xmm6[1],zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm13[1],ymm15[3],ymm13[3]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm14[2],ymm11[3],ymm14[3],ymm11[6],ymm14[6],ymm11[7],ymm14[7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,0],ymm6[4,5],ymm0[6,4]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; AVX1-NEXT: vmovaps %xmm9, %xmm10
-; AVX1-NEXT: vmovaps %xmm7, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2]
-; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX1-NEXT: vmovaps %ymm2, %ymm9
-; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[1],xmm1[1],zero,zero
+; AVX1-NEXT: vmovaps %xmm1, %xmm14
+; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm1[2],xmm5[2]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm8, %ymm10
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[4],ymm0[4],ymm8[5],ymm0[5]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,0],ymm9[1,0],ymm2[5,4],ymm9[5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm7 = mem[0],xmm8[1],zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm14[2],xmm5[3],xmm14[3]
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm3[2],xmm12[3],xmm3[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm1[2],xmm3[2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovaps %ymm10, %ymm14
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
+; AVX1-NEXT: vmovaps %ymm0, %ymm5
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm7 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4]
+; AVX1-NEXT: vmovaps %xmm11, %xmm10
+; AVX1-NEXT: vmovaps %xmm12, %xmm11
+; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm12[2]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm8[2],xmm12[3],xmm8[3]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7]
-; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm2 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm6[2,3],ymm2[6,4],ymm6[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm11[3,0]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX1-NEXT: vunpckhps (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0],ymm13[3,0],ymm15[7,4],ymm13[7,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm6 = xmm1[3,0],mem[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7]
-; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm6 = ymm9[3,0],mem[3,0],ymm9[7,4],mem[7,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,0],xmm12[3,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm9[3,0],ymm1[7,4],ymm9[7,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,0],xmm12[3,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm3, 32(%rsi)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -602,95 +609,95 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-NEXT: vmovaps %ymm0, (%rcx)
; AVX1-NEXT: vmovaps %ymm2, 32(%r8)
; AVX1-NEXT: vmovaps %ymm1, (%r8)
-; AVX1-NEXT: addq $312, %rsp # imm = 0x138
+; AVX1-NEXT: addq $264, %rsp # imm = 0x108
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i32_stride4_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $104, %rsp
-; AVX2-NEXT: vmovaps (%rdi), %ymm13
-; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 32(%rdi), %ymm9
-; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps (%rdi), %ymm8
+; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm7
+; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-NEXT: vmovaps 96(%rdi), %ymm5
-; AVX2-NEXT: vmovaps 160(%rdi), %ymm8
-; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 160(%rdi), %ymm12
+; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%rdi), %ymm3
; AVX2-NEXT: vmovaps 224(%rdi), %ymm2
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [17179869184,17179869184,17179869184,17179869184]
; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm1
; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps 144(%rdi), %xmm15
-; AVX2-NEXT: vmovaps 128(%rdi), %xmm10
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
-; AVX2-NEXT: vmovaps {{.*#+}} xmm7 = <u,u,0,4>
-; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm8
-; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
+; AVX2-NEXT: vmovaps 144(%rdi), %xmm10
+; AVX2-NEXT: vmovaps 128(%rdi), %xmm11
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-NEXT: vpermps %ymm12, %ymm9, %ymm12
+; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm1
; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpermps %ymm9, %ymm7, %ymm1
-; AVX2-NEXT: vmovaps (%rdi), %xmm7
-; AVX2-NEXT: vmovaps 16(%rdi), %xmm8
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT: vpermps %ymm7, %ymm9, %ymm1
+; AVX2-NEXT: vmovaps (%rdi), %xmm12
+; AVX2-NEXT: vmovaps 16(%rdi), %xmm13
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836481,21474836481,21474836481,21474836481]
; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm1
; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm6
-; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm9
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm15
; AVX2-NEXT: vmovaps 48(%rdi), %xmm6
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; AVX2-NEXT: vmovaps {{.*#+}} xmm11 = <1,5,u,u>
-; AVX2-NEXT: vpermps %ymm13, %ymm11, %ymm14
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u>
+; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm14
+; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm1
; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps 128(%rdi), %ymm14
-; AVX2-NEXT: vpermps %ymm14, %ymm11, %ymm11
+; AVX2-NEXT: vpermps %ymm14, %ymm7, %ymm7
; AVX2-NEXT: vmovaps 176(%rdi), %xmm1
; AVX2-NEXT: vmovaps 160(%rdi), %xmm0
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm11 = [25769803778,25769803778,25769803778,25769803778]
-; AVX2-NEXT: vpermps %ymm2, %ymm11, %ymm13
-; AVX2-NEXT: vpermps %ymm3, %ymm11, %ymm12
-; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; AVX2-NEXT: vmovaps {{.*#+}} xmm13 = <u,u,2,6>
-; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm5, %ymm11, %ymm12
-; AVX2-NEXT: vpermps %ymm4, %ymm11, %ymm11
-; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm8 = [30064771075,30064771075,30064771075,30064771075]
-; AVX2-NEXT: vpermps %ymm5, %ymm8, %ymm5
-; AVX2-NEXT: vpermps %ymm4, %ymm8, %ymm4
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [25769803778,25769803778,25769803778,25769803778]
+; AVX2-NEXT: vpermps %ymm2, %ymm7, %ymm8
+; AVX2-NEXT: vpermps %ymm3, %ymm7, %ymm9
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm10 = <u,u,2,6>
+; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vpermps %ymm5, %ymm7, %ymm9
+; AVX2-NEXT: vpermps %ymm4, %ymm7, %ymm7
+; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm9 = [30064771075,30064771075,30064771075,30064771075]
+; AVX2-NEXT: vpermps %ymm5, %ymm9, %ymm5
+; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
; AVX2-NEXT: vmovaps {{.*#+}} xmm6 = <3,7,u,u>
-; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3]
+; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm2, %ymm8, %ymm2
-; AVX2-NEXT: vpermps %ymm3, %ymm8, %ymm3
+; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm2
+; AVX2-NEXT: vpermps %ymm3, %ymm9, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpermps %ymm14, %ymm6, %ymm1
@@ -704,7 +711,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, (%rdx)
-; AVX2-NEXT: vmovaps %ymm10, 32(%rcx)
+; AVX2-NEXT: vmovaps %ymm8, 32(%rcx)
; AVX2-NEXT: vmovaps %ymm7, (%rcx)
; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
; AVX2-NEXT: vmovaps %ymm4, (%r8)
@@ -766,59 +773,60 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-LABEL: load_i32_stride4_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $456, %rsp # imm = 0x1C8
-; SSE-NEXT: movaps 336(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 368(%rdi), %xmm9
-; SSE-NEXT: movaps 352(%rdi), %xmm10
-; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 208(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 192(%rdi), %xmm5
+; SSE-NEXT: movaps 336(%rdi), %xmm5
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 240(%rdi), %xmm6
+; SSE-NEXT: movaps 368(%rdi), %xmm6
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 224(%rdi), %xmm14
-; SSE-NEXT: movaps 80(%rdi), %xmm4
+; SSE-NEXT: movaps 352(%rdi), %xmm4
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 208(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 192(%rdi), %xmm8
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 240(%rdi), %xmm9
+; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 224(%rdi), %xmm12
+; SSE-NEXT: movaps 80(%rdi), %xmm10
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 112(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 96(%rdi), %xmm11
+; SSE-NEXT: movaps 112(%rdi), %xmm11
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
+; SSE-NEXT: movaps 96(%rdi), %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm14, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; SSE-NEXT: movaps %xmm5, %xmm3
+; SSE-NEXT: movaps %xmm12, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
+; SSE-NEXT: movaps %xmm8, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE-NEXT: movaps %xmm9, %xmm7
+; SSE-NEXT: movaps %xmm4, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT: movaps 320(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 496(%rdi), %xmm1
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 480(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movaps 496(%rdi), %xmm2
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 480(%rdi), %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm13
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movaps 464(%rdi), %xmm3
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 448(%rdi), %xmm1
@@ -848,18 +856,18 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 304(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 288(%rdi), %xmm13
-; SSE-NEXT: movaps %xmm13, %xmm0
+; SSE-NEXT: movaps 288(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm7, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movaps 272(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 256(%rdi), %xmm15
-; SSE-NEXT: movaps %xmm15, %xmm12
-; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
-; SSE-NEXT: movaps %xmm12, %xmm1
+; SSE-NEXT: movaps %xmm15, %xmm11
+; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
+; SSE-NEXT: movaps %xmm11, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
; SSE-NEXT: movaps 432(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 416(%rdi), %xmm10
@@ -894,33 +902,34 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
-; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm13, %xmm12
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
+; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
@@ -948,18 +957,16 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1]
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm7, %xmm14
-; SSE-NEXT: movaps %xmm7, %xmm11
-; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm12[1]
+; SSE-NEXT: movaps %xmm13, %xmm12
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1]
; SSE-NEXT: movaps %xmm15, %xmm9
-; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm7[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm7[1]
; SSE-NEXT: movaps %xmm8, %xmm7
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1]
@@ -983,7 +990,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 16(%rsi)
; SSE-NEXT: movaps %xmm3, 96(%rdx)
-; SSE-NEXT: movaps %xmm12, 64(%rdx)
+; SSE-NEXT: movaps %xmm11, 64(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 32(%rdx)
; SSE-NEXT: movaps %xmm4, (%rdx)
@@ -997,7 +1004,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: movaps %xmm7, 96(%rcx)
; SSE-NEXT: movaps %xmm9, 64(%rcx)
-; SSE-NEXT: movaps %xmm11, 32(%rcx)
+; SSE-NEXT: movaps %xmm12, 32(%rcx)
; SSE-NEXT: movaps %xmm6, (%rcx)
; SSE-NEXT: movaps %xmm0, 112(%rcx)
; SSE-NEXT: movaps %xmm1, 80(%rcx)
@@ -1007,10 +1014,9 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps %xmm0, 16(%rcx)
; SSE-NEXT: movaps %xmm8, 96(%r8)
; SSE-NEXT: movaps %xmm15, 64(%r8)
-; SSE-NEXT: movaps %xmm14, 32(%r8)
+; SSE-NEXT: movaps %xmm13, 32(%r8)
; SSE-NEXT: movaps %xmm5, (%r8)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 112(%r8)
+; SSE-NEXT: movaps %xmm14, 112(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1042,7 +1048,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vmovaps 400(%rdi), %xmm5
-; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps 384(%rdi), %xmm2
; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
@@ -1054,229 +1060,227 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
; AVX1-NEXT: vmovaps %ymm1, %ymm6
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps %ymm3, %ymm5
+; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1]
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
-; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX1-NEXT: vmovaps 288(%rdi), %xmm2
; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 304(%rdi), %xmm1
; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vmovaps 272(%rdi), %xmm3
-; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 256(%rdi), %xmm2
-; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-NEXT: vmovaps 272(%rdi), %xmm4
+; AVX1-NEXT: vmovaps 256(%rdi), %xmm5
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX1-NEXT: vmovaps %xmm5, %xmm8
+; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 192(%rdi), %ymm0
-; AVX1-NEXT: vmovaps 224(%rdi), %ymm1
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX1-NEXT: vmovaps %ymm3, %ymm13
-; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps %ymm1, %ymm12
-; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1]
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
+; AVX1-NEXT: vmovaps 224(%rdi), %ymm7
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps %ymm0, %ymm15
+; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX1-NEXT: vmovaps %ymm1, %ymm13
+; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovaps %ymm0, %ymm12
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
; AVX1-NEXT: vmovaps 160(%rdi), %xmm0
-; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 176(%rdi), %xmm1
-; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovaps 144(%rdi), %xmm4
-; AVX1-NEXT: vmovaps 128(%rdi), %xmm8
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,0]
+; AVX1-NEXT: vmovaps %xmm1, %xmm14
+; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps 144(%rdi), %xmm0
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps 128(%rdi), %xmm5
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,0]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 64(%rdi), %ymm0
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps 96(%rdi), %ymm1
-; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vmovaps 96(%rdi), %ymm2
; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
+; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm0
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm2
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm2
; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm0
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vmovaps (%rdi), %xmm2
; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 16(%rdi), %xmm3
; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[4],ymm10[4],ymm6[5],ymm10[5]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm6[1,0],ymm5[5,4],ymm6[5,4]
+; AVX1-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm1 = ymm6[1,0],mem[1,0],ymm6[5,4],mem[5,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[1],xmm5[1],zero,zero
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm7 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm4[1],zero,zero
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,0],ymm9[1,0],ymm15[5,4],ymm9[5,4]
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm13[1,0],ymm12[5,4],ymm13[5,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm4[1],zero,zero
-; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm7 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm5[1],xmm7[1],zero,zero
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[4],ymm2[4],ymm14[5],ymm2[5]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm12[1,0],ymm3[5,4],ymm12[5,4]
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[4],ymm2[4],ymm10[5],ymm2[5]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX1-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm1 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm1[1],zero,zero
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm13[1],xmm1[1],zero,zero
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[1,0],ymm1[5,4],ymm13[5,4]
+; AVX1-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm7 = mem[0],xmm7[1],zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
+; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm15 = mem[0],xmm15[1],zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm0 = ymm10[1],mem[1],ymm10[3],mem[3]
-; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm1 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7]
+; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm0 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm7 = zero,zero,xmm5[2],mem[0]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm4[2],xmm8[2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm1 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm5[2],xmm6[2]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm15 = zero,zero,xmm4[2],mem[0]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm14[1],ymm2[3],ymm14[3]
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[6],ymm3[6],ymm12[7],ymm3[7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm4[2],xmm9[2]
-; AVX1-NEXT: vmovaps %xmm9, %xmm3
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
+; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm7[2],xmm13[3],xmm7[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm9[2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm0 = ymm3[1],mem[1],ymm3[3],mem[3]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm12[1],ymm14[3],ymm12[3]
-; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm1 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm15[2]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm12[2]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[3,0],ymm1[7,4],ymm10[7,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm8[3,0],ymm1[7,4],ymm8[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm4 = xmm11[3,0],mem[3,0]
+; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm1 = xmm4[2],mem[2],xmm4[3],mem[3]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm11[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm4 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[2,3],ymm4[6,4],ymm1[6,7]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm5 = xmm2[3,0],mem[3,0]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3]
+; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm5 = xmm7[3,0],mem[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm5 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm4[2,3],ymm5[6,4],ymm4[6,7]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
; AVX1-NEXT: # xmm6 = xmm2[3,0],mem[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,0],xmm5[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm2 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm14[3,0],ymm13[3,0],ymm14[7,4],ymm13[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,0],xmm8[3,0]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm10[3,0],xmm9[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -1297,7 +1301,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-NEXT: vmovaps %ymm3, 64(%rdx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm3, 96(%rcx)
-; AVX1-NEXT: vmovaps %ymm7, (%rcx)
+; AVX1-NEXT: vmovaps %ymm15, (%rcx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm3, 32(%rcx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -1312,7 +1316,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-LABEL: load_i32_stride4_vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $696, %rsp # imm = 0x2B8
+; AVX2-NEXT: subq $680, %rsp # imm = 0x2A8
; AVX2-NEXT: vmovaps 288(%rdi), %ymm12
; AVX2-NEXT: vmovaps 320(%rdi), %ymm5
; AVX2-NEXT: vmovaps 352(%rdi), %ymm6
@@ -1337,8 +1341,9 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm6, %ymm3, %ymm0
+; AVX2-NEXT: vmovaps %ymm6, %ymm7
; AVX2-NEXT: vpermps %ymm5, %ymm3, %ymm1
-; AVX2-NEXT: vmovaps %ymm5, %ymm9
+; AVX2-NEXT: vmovaps %ymm5, %ymm6
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovaps 272(%rdi), %xmm2
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1375,7 +1380,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4
; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1383,12 +1388,12 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836481,21474836481,21474836481,21474836481]
-; AVX2-NEXT: vpermps %ymm6, %ymm3, %ymm4
-; AVX2-NEXT: vmovaps %ymm6, %ymm2
+; AVX2-NEXT: vpermps %ymm7, %ymm3, %ymm4
+; AVX2-NEXT: vmovaps %ymm7, %ymm2
+; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps %ymm6, %ymm1
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps %ymm9, %ymm1
-; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermps %ymm9, %ymm3, %ymm5
+; AVX2-NEXT: vpermps %ymm6, %ymm3, %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vmovaps 256(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1405,29 +1410,28 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpermps %ymm10, %ymm3, %ymm5
; AVX2-NEXT: vpermps %ymm8, %ymm3, %ymm6
; AVX2-NEXT: vmovaps %ymm8, %ymm7
-; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vmovaps 128(%rdi), %ymm0
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 176(%rdi), %xmm6
-; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 176(%rdi), %xmm8
+; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 160(%rdi), %xmm5
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm9
-; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm13[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermps %ymm11, %ymm3, %ymm6
; AVX2-NEXT: vpermps %ymm15, %ymm3, %ymm8
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-NEXT: vmovaps (%rdi), %ymm13
-; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm6
-; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 48(%rdi), %xmm0
+; AVX2-NEXT: vmovaps (%rdi), %ymm8
+; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; AVX2-NEXT: vpermps %ymm13, %ymm4, %ymm13
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm8[2,3]
+; AVX2-NEXT: vmovaps 48(%rdi), %xmm6
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; AVX2-NEXT: vpermps %ymm8, %ymm4, %ymm13
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps %ymm14, %ymm9
@@ -1479,14 +1483,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
+; AVX2-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3]
+; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [30064771075,30064771075,30064771075,30064771075]
-; AVX2-NEXT: vpermps %ymm10, %ymm6, %ymm0
-; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm1
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm12 = [30064771075,30064771075,30064771075,30064771075]
+; AVX2-NEXT: vpermps %ymm10, %ymm12, %ymm0
+; AVX2-NEXT: vpermps %ymm7, %ymm12, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm5[2],mem[2],xmm5[3],mem[3]
@@ -1494,24 +1498,23 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm11, %ymm6, %ymm0
-; AVX2-NEXT: vpermps %ymm15, %ymm6, %ymm2
+; AVX2-NEXT: vpermps %ymm11, %ymm12, %ymm0
+; AVX2-NEXT: vpermps %ymm15, %ymm12, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
-; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm9, %ymm6, %ymm2
-; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm7
-; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vpermps %ymm9, %ymm12, %ymm2
+; AVX2-NEXT: vpermps %ymm8, %ymm12, %ymm6
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
@@ -1545,7 +1548,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovaps %ymm2, 96(%r8)
; AVX2-NEXT: vmovaps %ymm0, (%r8)
; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX2-NEXT: addq $696, %rsp # imm = 0x2B8
+; AVX2-NEXT: addq $680, %rsp # imm = 0x2A8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index a1c8c8377cd9d..33f6530aa0b76 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -144,53 +144,53 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-LABEL: load_i32_stride6_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa 80(%rdi), %xmm9
+; SSE-NEXT: movdqa 80(%rdi), %xmm1
; SSE-NEXT: movdqa 64(%rdi), %xmm0
; SSE-NEXT: movdqa (%rdi), %xmm4
-; SSE-NEXT: movdqa 16(%rdi), %xmm12
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; SSE-NEXT: movdqa 32(%rdi), %xmm7
-; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm10[0],xmm3[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm11[0],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
-; SSE-NEXT: movapd %xmm1, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
+; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa 32(%rdi), %xmm4
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
+; SSE-NEXT: movapd %xmm9, (%rsi)
; SSE-NEXT: movapd %xmm3, (%rdx)
-; SSE-NEXT: movapd %xmm4, (%rcx)
-; SSE-NEXT: movapd %xmm6, (%r8)
-; SSE-NEXT: movapd %xmm2, (%r9)
+; SSE-NEXT: movapd %xmm5, (%rcx)
+; SSE-NEXT: movapd %xmm10, (%r8)
+; SSE-NEXT: movapd %xmm8, (%r9)
; SSE-NEXT: movapd %xmm0, (%rax)
; SSE-NEXT: retq
;
@@ -204,10 +204,10 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,3]
; AVX1-NEXT: vmovaps 64(%rdi), %xmm5
-; AVX1-NEXT: vinsertps {{.*#+}} xmm8 = xmm4[0,1,2],xmm5[2]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[2]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[1,3]
-; AVX1-NEXT: vblendps {{.*#+}} xmm9 = xmm2[0,1,2],xmm5[3]
+; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,3]
@@ -217,21 +217,21 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[1]
; AVX1-NEXT: vmovaps 32(%rdi), %xmm1
; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[2,2,3,3]
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm4
-; AVX1-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0],xmm7[1],xmm4[2,3]
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm6[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm8
+; AVX1-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,0,2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
-; AVX1-NEXT: vmovaps %xmm8, (%rsi)
-; AVX1-NEXT: vmovaps %xmm9, (%rdx)
+; AVX1-NEXT: vmovaps %xmm4, (%rsi)
+; AVX1-NEXT: vmovaps %xmm2, (%rdx)
; AVX1-NEXT: vmovaps %xmm3, (%rcx)
; AVX1-NEXT: vmovaps %xmm0, (%r8)
-; AVX1-NEXT: vmovaps %xmm2, (%r9)
+; AVX1-NEXT: vmovaps %xmm7, (%r9)
; AVX1-NEXT: vmovaps %xmm1, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -246,7 +246,7 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1,2],xmm5[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <1,7,5,u>
; AVX2-NEXT: vpermd %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
@@ -256,28 +256,28 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,3,2,3]
; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7
-; AVX2-NEXT: vpbroadcastd %xmm7, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
+; AVX2-NEXT: vpbroadcastd %xmm7, %xmm8
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,3,3,4,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,3,3,4,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3]
-; AVX2-NEXT: vpbroadcastd 84(%rdi), %xmm6
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm7[2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
+; AVX2-NEXT: vpbroadcastd 84(%rdi), %xmm8
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <4,2,u,u>
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <5,3,u,u>
-; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <5,3,u,u>
+; AVX2-NEXT: vpermd %ymm1, %ymm7, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
-; AVX2-NEXT: vmovdqa %xmm8, (%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vmovdqa %xmm3, (%rdx)
-; AVX2-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX2-NEXT: vmovdqa %xmm6, (%rcx)
; AVX2-NEXT: vmovdqa %xmm5, (%r8)
; AVX2-NEXT: vmovdqa %xmm2, (%r9)
; AVX2-NEXT: vmovdqa %xmm1, (%rax)
@@ -286,52 +286,52 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i32_stride6_vf4:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm0
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vpextrd $2, %xmm0, %eax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm2, %xmm4
-; AVX512-NEXT: vmovd %xmm3, %eax
-; AVX512-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX512-NEXT: vpextrd $2, %xmm0, %r10d
+; AVX512-NEXT: vpinsrd $1, %r10d, %xmm2, %xmm4
+; AVX512-NEXT: vmovd %xmm3, %r10d
+; AVX512-NEXT: vpinsrd $2, %r10d, %xmm4, %xmm4
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT: vpextrd $2, %xmm5, %eax
-; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm8
-; AVX512-NEXT: vpextrd $1, %xmm3, %eax
+; AVX512-NEXT: vpextrd $2, %xmm5, %r10d
+; AVX512-NEXT: vpinsrd $3, %r10d, %xmm4, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm3, %r10d
; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm0[2,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
-; AVX512-NEXT: vpinsrd $2, %eax, %xmm6, %xmm6
+; AVX512-NEXT: vpinsrd $2, %r10d, %xmm6, %xmm6
; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm2[2,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,0,2,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3]
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512-NEXT: vmovd %xmm4, %eax
-; AVX512-NEXT: vpinsrd $3, %eax, %xmm7, %xmm7
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm8
+; AVX512-NEXT: vmovd %xmm8, %edi
+; AVX512-NEXT: vpinsrd $3, %edi, %xmm7, %xmm7
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
-; AVX512-NEXT: vpextrd $3, %xmm3, %eax
-; AVX512-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX512-NEXT: vpextrd $1, %xmm4, %eax
-; AVX512-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
-; AVX512-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm3
-; AVX512-NEXT: vmovd %xmm5, %eax
-; AVX512-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrd $2, %xmm4, %eax
-; AVX512-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrd $1, %xmm5, %eax
+; AVX512-NEXT: vpextrd $3, %xmm3, %edi
+; AVX512-NEXT: vpinsrd $2, %edi, %xmm2, %xmm2
+; AVX512-NEXT: vpextrd $1, %xmm8, %edi
+; AVX512-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2
+; AVX512-NEXT: vpextrd $2, %xmm1, %edi
+; AVX512-NEXT: vpinsrd $1, %edi, %xmm0, %xmm3
+; AVX512-NEXT: vmovd %xmm5, %edi
+; AVX512-NEXT: vpinsrd $2, %edi, %xmm3, %xmm3
+; AVX512-NEXT: vpextrd $2, %xmm8, %edi
+; AVX512-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3
+; AVX512-NEXT: vpextrd $1, %xmm5, %edi
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; AVX512-NEXT: vmovdqa %xmm8, (%rsi)
+; AVX512-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3]
+; AVX512-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512-NEXT: vmovdqa %xmm6, (%rdx)
; AVX512-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512-NEXT: vmovdqa %xmm2, (%r8)
; AVX512-NEXT: vmovdqa %xmm3, (%r9)
-; AVX512-NEXT: vmovdqa %xmm0, (%r10)
+; AVX512-NEXT: vmovdqa %xmm0, (%rax)
; AVX512-NEXT: retq
%wide.vec = load <24 x i32>, ptr %in.vec, align 32
@@ -355,200 +355,198 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa 144(%rdi), %xmm10
-; SSE-NEXT: movdqa 160(%rdi), %xmm14
-; SSE-NEXT: movdqa 96(%rdi), %xmm11
-; SSE-NEXT: movdqa 112(%rdi), %xmm5
-; SSE-NEXT: movdqa 64(%rdi), %xmm7
-; SSE-NEXT: movdqa (%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm2
-; SSE-NEXT: movdqa 48(%rdi), %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; SSE-NEXT: movdqa %xmm2, %xmm8
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
-; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1]
-; SSE-NEXT: movdqa %xmm11, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
+; SSE-NEXT: movdqa 144(%rdi), %xmm12
+; SSE-NEXT: movdqa 160(%rdi), %xmm2
+; SSE-NEXT: movdqa 96(%rdi), %xmm5
+; SSE-NEXT: movdqa 112(%rdi), %xmm3
+; SSE-NEXT: movdqa 64(%rdi), %xmm6
+; SSE-NEXT: movdqa (%rdi), %xmm7
+; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: movdqa 48(%rdi), %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm11
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,3,2,3]
+; SSE-NEXT: movdqa %xmm7, %xmm8
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1]
+; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,1,1]
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,3,3]
+; SSE-NEXT: movdqa %xmm4, %xmm11
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
+; SSE-NEXT: movdqa 80(%rdi), %xmm15
+; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; SSE-NEXT: movdqa 32(%rdi), %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1]
+; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1]
+; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,3,3]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: movdqa 176(%rdi), %xmm14
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE-NEXT: movdqa 128(%rdi), %xmm8
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1]
+; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3]
+; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; SSE-NEXT: # xmm5 = mem[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
+; SSE-NEXT: movdqa %xmm6, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
-; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
+; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1]
+; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3]
-; SSE-NEXT: movdqa %xmm12, %xmm13
-; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
-; SSE-NEXT: movdqa %xmm4, %xmm7
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3]
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: movdqa %xmm10, %xmm9
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; SSE-NEXT: movdqa 80(%rdi), %xmm10
-; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE-NEXT: movdqa 32(%rdi), %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1]
-; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: movdqa 176(%rdi), %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3]
-; SSE-NEXT: movdqa 128(%rdi), %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
-; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
-; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
-; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
-; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE-NEXT: movdqa %xmm7, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1]
-; SSE-NEXT: movapd %xmm15, 16(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, (%rsi)
-; SSE-NEXT: movapd %xmm9, 16(%rdx)
-; SSE-NEXT: movapd %xmm13, (%rdx)
-; SSE-NEXT: movapd %xmm5, 16(%rcx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, (%rcx)
-; SSE-NEXT: movapd %xmm11, 16(%r8)
-; SSE-NEXT: movapd %xmm12, (%r8)
+; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
+; SSE-NEXT: movapd %xmm10, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%rsi)
+; SSE-NEXT: movapd %xmm12, 16(%rdx)
+; SSE-NEXT: movapd %xmm11, (%rdx)
+; SSE-NEXT: movapd %xmm13, 16(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%rcx)
+; SSE-NEXT: movapd %xmm4, 16(%r8)
+; SSE-NEXT: movapd %xmm7, (%r8)
; SSE-NEXT: movapd %xmm0, 16(%r9)
-; SSE-NEXT: movapd %xmm1, (%r9)
+; SSE-NEXT: movapd %xmm5, (%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movapd %xmm14, 16(%rax)
-; SSE-NEXT: movapd %xmm7, (%rax)
+; SSE-NEXT: movapd %xmm9, 16(%rax)
+; SSE-NEXT: movapd %xmm6, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride6_vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovaps 128(%rdi), %ymm10
+; AVX1-NEXT: vmovaps 128(%rdi), %ymm3
; AVX1-NEXT: vmovaps 160(%rdi), %ymm4
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm9
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm6
; AVX1-NEXT: vmovaps (%rdi), %ymm7
; AVX1-NEXT: vmovaps 96(%rdi), %ymm0
; AVX1-NEXT: vmovaps 64(%rdi), %ymm1
; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm5
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm5[0,0],ymm0[6,4],ymm5[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,2],ymm2[6,4],ymm5[6,6]
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm6
-; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm6[2,3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5],ymm3[6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm10[4,5],ymm4[6,7]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm3[2,3,0,1]
-; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm9
+; AVX1-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1],xmm9[2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5],ymm10[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm10[0,0],ymm11[6,4],ymm10[4,4]
; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm12[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,0],ymm5[2,3],ymm12[6,4],ymm5[6,7]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,0],xmm6[3,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[1,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5],ymm2[6,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,0],ymm2[2,0],ymm3[4,4],ymm2[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm5
-; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,0],xmm5[2,3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm11 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3,4],ymm9[5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm6[5,6,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[3,1],ymm3[4,5],ymm2[7,5]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,1],xmm5[3,3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4],ymm3[5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7]
+; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm9[3,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[1,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5],ymm8[6,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,0],ymm10[1,0],ymm11[7,4],ymm10[5,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm11
+; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,0],xmm11[2,3]
+; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4],ymm6[5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5]
+; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,1],xmm11[3,3]
+; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4],ymm7[5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,2],ymm5[2,0],ymm4[4,6],ymm5[6,4]
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm7
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[2,2,3,3]
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0],xmm6[1],xmm2[2,3]
-; AVX1-NEXT: vmovapd 80(%rdi), %xmm10
-; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm10[1],ymm1[0],ymm10[2],ymm1[2]
-; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm12[2,0],ymm0[4,5],ymm12[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3,4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4]
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm9
+; AVX1-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,2,3,3]
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm11
+; AVX1-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
+; AVX1-NEXT: vmovapd 80(%rdi), %xmm12
+; AVX1-NEXT: vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[2]
+; AVX1-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1],ymm1[1,3],ymm10[7,5],ymm1[5,7]
+; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
-; AVX1-NEXT: vmovaps %ymm13, (%rsi)
-; AVX1-NEXT: vmovaps %ymm8, (%rdx)
-; AVX1-NEXT: vmovaps %ymm9, (%rcx)
-; AVX1-NEXT: vmovaps %ymm11, (%r8)
-; AVX1-NEXT: vmovaps %ymm5, (%r9)
+; AVX1-NEXT: vmovaps %ymm2, (%rsi)
+; AVX1-NEXT: vmovaps %ymm5, (%rdx)
+; AVX1-NEXT: vmovaps %ymm6, (%rcx)
+; AVX1-NEXT: vmovaps %ymm7, (%r8)
+; AVX1-NEXT: vmovaps %ymm8, (%r9)
; AVX1-NEXT: vmovaps %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -879,247 +877,242 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: subq $376, %rsp # imm = 0x178
-; SSE-NEXT: movdqa 144(%rdi), %xmm9
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 160(%rdi), %xmm8
-; SSE-NEXT: movdqa 96(%rdi), %xmm12
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 112(%rdi), %xmm10
-; SSE-NEXT: movdqa 240(%rdi), %xmm11
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 256(%rdi), %xmm7
-; SSE-NEXT: movdqa 192(%rdi), %xmm0
+; SSE-NEXT: subq $360, %rsp # imm = 0x168
+; SSE-NEXT: movdqa 144(%rdi), %xmm8
+; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 160(%rdi), %xmm4
+; SSE-NEXT: movdqa 96(%rdi), %xmm11
+; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa 112(%rdi), %xmm5
+; SSE-NEXT: movdqa 240(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 208(%rdi), %xmm1
-; SSE-NEXT: movdqa 64(%rdi), %xmm6
-; SSE-NEXT: movdqa (%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm14
-; SSE-NEXT: movdqa 48(%rdi), %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3]
-; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm3
+; SSE-NEXT: movdqa 256(%rdi), %xmm7
+; SSE-NEXT: movdqa 192(%rdi), %xmm15
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 208(%rdi), %xmm14
+; SSE-NEXT: movdqa 64(%rdi), %xmm12
+; SSE-NEXT: movdqa (%rdi), %xmm9
+; SSE-NEXT: movdqa 16(%rdi), %xmm13
+; SSE-NEXT: movdqa 48(%rdi), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3]
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3]
-; SSE-NEXT: movdqa %xmm6, %xmm15
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,3,3]
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1]
; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm13
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3]
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm15, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3]
+; SSE-NEXT: movdqa %xmm7, %xmm10
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,0,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1]
; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3]
-; SSE-NEXT: movdqa %xmm10, %xmm11
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE-NEXT: movdqa %xmm5, %xmm7
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm11, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3]
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 288(%rdi), %xmm9
-; SSE-NEXT: movdqa 304(%rdi), %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3]
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3]
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 288(%rdi), %xmm2
+; SSE-NEXT: movdqa 304(%rdi), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: movdqa 336(%rdi), %xmm12
+; SSE-NEXT: movdqa 336(%rdi), %xmm11
; SSE-NEXT: movdqa 352(%rdi), %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3]
+; SSE-NEXT: movdqa %xmm5, %xmm15
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
-; SSE-NEXT: movdqa 80(%rdi), %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm8, %xmm14
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3]
-; SSE-NEXT: movdqa 32(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3]
-; SSE-NEXT: movdqa 272(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3]
+; SSE-NEXT: movdqa %xmm11, %xmm5
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3]
+; SSE-NEXT: movdqa %xmm8, %xmm15
+; SSE-NEXT: movdqa 80(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movdqa %xmm9, %xmm13
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
+; SSE-NEXT: movdqa 32(%rdi), %xmm5
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
+; SSE-NEXT: movdqa 272(%rdi), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm10
-; SSE-NEXT: movdqa 224(%rdi), %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: movdqa 224(%rdi), %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
-; SSE-NEXT: movdqa 368(%rdi), %xmm15
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
+; SSE-NEXT: movdqa 368(%rdi), %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
-; SSE-NEXT: movdqa 320(%rdi), %xmm11
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE-NEXT: movdqa 320(%rdi), %xmm7
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
-; SSE-NEXT: movdqa 176(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
+; SSE-NEXT: movdqa 176(%rdi), %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm13, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3]
-; SSE-NEXT: movdqa 128(%rdi), %xmm8
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3]
+; SSE-NEXT: movdqa 128(%rdi), %xmm6
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; SSE-NEXT: movdqa %xmm5, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3]
+; SSE-NEXT: movdqa %xmm5, %xmm4
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[2,3,2,3]
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
-; SSE-NEXT: movdqa %xmm4, %xmm14
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
+; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
+; SSE-NEXT: movdqa %xmm1, %xmm13
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: # xmm13 = mem[2,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1]
-; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1]
-; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
-; SSE-NEXT: movdqa %xmm8, %xmm5
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1]
+; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3]
+; SSE-NEXT: movdqa %xmm6, %xmm3
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; SSE-NEXT: # xmm10 = mem[2,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1]
-; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
-; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
-; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = mem[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3]
-; SSE-NEXT: movdqa %xmm14, %xmm7
+; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE-NEXT: # xmm11 = mem[2,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
+; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
+; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; SSE-NEXT: # xmm6 = mem[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,3,3]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,3,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, %xmm11
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
+; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1]
+; SSE-NEXT: movapd %xmm14, %xmm3
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3]
+; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
-; SSE-NEXT: pshufd $85, (%rsp), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: movapd %xmm9, %xmm14
+; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[2,3,2,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
+; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = mem[1,1,1,1]
; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
@@ -1127,57 +1120,56 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = mem[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 48(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 16(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 32(%rsi)
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1]
+; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 48(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 16(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 32(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, (%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 16(%rcx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 48(%rcx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 32(%rcx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, (%rcx)
-; SSE-NEXT: movapd %xmm10, 16(%r8)
-; SSE-NEXT: movapd %xmm12, 48(%r8)
-; SSE-NEXT: movapd %xmm13, 32(%r8)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, (%r8)
-; SSE-NEXT: movapd %xmm3, 48(%r9)
-; SSE-NEXT: movapd %xmm5, 16(%r9)
-; SSE-NEXT: movapd %xmm6, 32(%r9)
-; SSE-NEXT: movapd %xmm8, (%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%rcx)
+; SSE-NEXT: movapd %xmm11, 16(%r8)
+; SSE-NEXT: movapd %xmm8, 48(%r8)
+; SSE-NEXT: movapd %xmm10, 32(%r8)
+; SSE-NEXT: movapd %xmm15, (%r8)
+; SSE-NEXT: movapd %xmm0, 48(%r9)
+; SSE-NEXT: movapd %xmm4, 16(%r9)
+; SSE-NEXT: movapd %xmm5, 32(%r9)
+; SSE-NEXT: movapd %xmm6, (%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movapd %xmm14, 16(%rax)
-; SSE-NEXT: movapd %xmm7, (%rax)
-; SSE-NEXT: movapd %xmm9, 32(%rax)
-; SSE-NEXT: movapd %xmm11, 48(%rax)
-; SSE-NEXT: addq $376, %rsp # imm = 0x178
+; SSE-NEXT: movapd %xmm13, 16(%rax)
+; SSE-NEXT: movapd %xmm9, (%rax)
+; SSE-NEXT: movapd %xmm14, 32(%rax)
+; SSE-NEXT: movapd %xmm3, 48(%rax)
+; SSE-NEXT: addq $360, %rsp # imm = 0x168
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i32_stride6_vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $328, %rsp # imm = 0x148
-; AVX1-NEXT: vmovaps 96(%rdi), %ymm14
-; AVX1-NEXT: vmovaps 64(%rdi), %ymm6
-; AVX1-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: subq $360, %rsp # imm = 0x168
+; AVX1-NEXT: vmovaps 96(%rdi), %ymm6
+; AVX1-NEXT: vmovaps 64(%rdi), %ymm7
+; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 320(%rdi), %ymm2
-; AVX1-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 352(%rdi), %ymm3
; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 224(%rdi), %ymm4
@@ -1203,28 +1195,31 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm6, %ymm2
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm2[0,0],ymm14[6,4],ymm2[4,4]
+; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm7, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,0],ymm2[0,0],ymm6[6,4],ymm2[4,4]
+; AVX1-NEXT: vmovaps %ymm6, %ymm7
; AVX1-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm2[2,2],ymm1[6,4],ymm2[6,6]
; AVX1-NEXT: vmovaps 32(%rdi), %ymm11
; AVX1-NEXT: vmovaps (%rdi), %ymm8
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
-; AVX1-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm0[2,3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3,4,5],ymm7[6,7]
-; AVX1-NEXT: vmovaps 128(%rdi), %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1],xmm0[2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7]
+; AVX1-NEXT: vmovaps 128(%rdi), %ymm4
; AVX1-NEXT: vmovaps 160(%rdi), %ymm12
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm4[4,5],ymm12[6,7]
; AVX1-NEXT: vmovaps %ymm12, %ymm13
; AVX1-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovaps %ymm4, %ymm14
+; AVX1-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm4[2,0],ymm1[0,0],ymm4[6,4],ymm1[4,4]
; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
-; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0],ymm2[1,0],ymm14[7,4],ymm2[5,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,0],ymm2[2,3],ymm7[6,4],ymm2[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7]
+; AVX1-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,0],ymm2[1,0],ymm7[7,4],ymm2[5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[2,3],ymm6[6,4],ymm2[6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,2],xmm0[1,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5],ymm0[6,7]
@@ -1232,8 +1227,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm9[1,0],ymm12[7,4],ymm9[5,4]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,0],ymm9[1,0],ymm6[7,4],ymm9[5,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,3],ymm0[6,4],ymm9[6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,0],xmm15[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm15[1,3]
@@ -1244,15 +1239,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7]
-; AVX1-NEXT: vmovaps %ymm14, %ymm9
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,1],ymm10[2,0],ymm14[6,5],ymm10[6,4]
+; AVX1-NEXT: vmovaps %ymm7, %ymm9
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm7[2,1],ymm15[2,0],ymm7[6,5],ymm15[6,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,0],xmm2[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4],ymm3[5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm13[4,5],ymm6[6,7]
-; AVX1-NEXT: vmovaps %ymm6, %ymm14
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm3[2,0],ymm4[4,4],ymm3[6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
@@ -1260,105 +1254,106 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX1-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4]
+; AVX1-NEXT: vmovaps %ymm6, %ymm10
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,1],ymm12[2,0],ymm6[6,5],ymm12[6,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm5[2,0],xmm6[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4],ymm7[5,6,7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX1-NEXT: vblendps $207, (%rsp), %ymm13, %ymm7 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5],mem[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,0],ymm7[2,0],ymm8[4,4],ymm7[6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[3,1],ymm4[4,5],ymm3[7,5]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm2[3,3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,1],ymm10[2,1],ymm9[7,5],ymm10[6,5]
-; AVX1-NEXT: vmovaps %ymm10, %ymm11
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,1],ymm15[2,1],ymm9[7,5],ymm15[6,5]
+; AVX1-NEXT: vmovaps %ymm15, %ymm0
+; AVX1-NEXT: vmovaps %ymm9, %ymm15
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1],ymm7[3,1],ymm8[4,5],ymm7[7,5]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[3,1],xmm6[3,3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,1],ymm15[2,1],ymm12[7,5],ymm15[6,5]
-; AVX1-NEXT: vmovaps %ymm12, %ymm10
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm10[3,1],ymm12[2,1],ymm10[7,5],ymm12[6,5]
+; AVX1-NEXT: vmovaps %ymm12, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm14
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm14[2,2,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
+; AVX1-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm3
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[2,2,3,3]
; AVX1-NEXT: vmovaps 16(%rdi), %xmm5
; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
; AVX1-NEXT: vmovapd 80(%rdi), %xmm6
-; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm6[1],ymm11[0],ymm6[2],ymm11[2]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm6[1],ymm0[0],ymm6[2],ymm0[2]
; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,0],ymm9[4,5],ymm7[6,4]
-; AVX1-NEXT: vmovaps %ymm9, %ymm12
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm8[2,0],ymm7[4,6],ymm8[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vblendps $12, (%rsp), %ymm13, %ymm4 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm4 = ymm13[0,1],mem[2,3],ymm13[4,5,6,7]
-; AVX1-NEXT: vmovaps 224(%rdi), %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX1-NEXT: vmovaps 208(%rdi), %xmm3
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX1-NEXT: vmovapd 272(%rdi), %xmm8
-; AVX1-NEXT: vshufpd {{.*#+}} ymm9 = ymm8[1],ymm15[0],ymm8[2],ymm15[2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7]
+; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7]
+; AVX1-NEXT: vmovaps 224(%rdi), %xmm8
+; AVX1-NEXT: vpermilps {{.*#+}} xmm11 = xmm8[2,2,3,3]
+; AVX1-NEXT: vmovaps 208(%rdi), %xmm12
+; AVX1-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
+; AVX1-NEXT: vmovapd 272(%rdi), %xmm14
+; AVX1-NEXT: vshufpd {{.*#+}} ymm9 = ymm14[1],ymm1[0],ymm14[2],ymm1[2]
; AVX1-NEXT: vmovaps %ymm10, %ymm13
; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[2,3,0,1]
-; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm4[2,0],ymm9[0,0],ymm4[6,4],ymm9[4,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,2],ymm10[2,0],ymm9[4,6],ymm10[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm4[2,0],ymm11[0,0],ymm4[6,4],ymm11[4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[2,0],ymm11[4,6],ymm10[6,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm7[1,0],ymm2[7,4],ymm7[5,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,3],ymm2[2,0],ymm7[4,7],ymm2[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1],ymm11[1,3],ymm6[7,5],ymm11[5,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,1],ymm6[2,0],ymm12[5,5],ymm6[6,4]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm9[1,0],ymm4[7,4],ymm9[5,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,3],ymm4[2,0],ymm9[4,7],ymm4[6,4]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,1],ymm15[1,3],ymm8[7,5],ymm15[5,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,1],ymm3[2,0],ymm13[5,5],ymm3[6,4]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm3, (%rsi)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm3, 32(%rsi)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm3, 32(%rdx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm3, (%rdx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm3, 32(%rcx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm3, (%rcx)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm3, 32(%r8)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm3, (%r8)
-; AVX1-NEXT: vmovaps %ymm1, 32(%r9)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm1, (%r9)
+; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1],ymm0[1,3],ymm6[7,5],ymm0[5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1],ymm5[2,0],ymm15[5,5],ymm5[6,4]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm11[1,0],ymm4[7,4],ymm11[5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,3],ymm3[2,0],ymm11[4,7],ymm3[6,4]
+; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm8[2,3]
+; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm14[3,1],ymm1[1,3],ymm14[7,5],ymm1[5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1],ymm5[2,0],ymm13[5,5],ymm5[6,4]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rsi)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rdx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rcx)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%r8)
+; AVX1-NEXT: vmovaps %ymm9, 32(%r9)
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%r9)
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovaps %ymm0, 32(%rax)
+; AVX1-NEXT: vmovaps %ymm3, 32(%rax)
; AVX1-NEXT: vmovaps %ymm2, (%rax)
-; AVX1-NEXT: addq $328, %rsp # imm = 0x148
+; AVX1-NEXT: addq $360, %rsp # imm = 0x168
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
index 924b6f22cd02f..ed176f2780156 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
@@ -120,34 +120,34 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i64_stride2_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm6
-; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps 32(%rdi), %xmm4
-; SSE-NEXT: movaps 48(%rdi), %xmm9
-; SSE-NEXT: movaps 80(%rdi), %xmm10
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movaps 16(%rdi), %xmm1
+; SSE-NEXT: movaps 32(%rdi), %xmm2
+; SSE-NEXT: movaps 48(%rdi), %xmm3
+; SSE-NEXT: movaps 80(%rdi), %xmm4
; SSE-NEXT: movaps 64(%rdi), %xmm5
-; SSE-NEXT: movaps 112(%rdi), %xmm11
+; SSE-NEXT: movaps 112(%rdi), %xmm6
; SSE-NEXT: movaps 96(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0]
-; SSE-NEXT: movaps %xmm5, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0]
-; SSE-NEXT: movaps %xmm4, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
-; SSE-NEXT: movaps %xmm3, 32(%rsi)
-; SSE-NEXT: movaps %xmm0, (%rsi)
-; SSE-NEXT: movaps %xmm1, 48(%rsi)
-; SSE-NEXT: movaps %xmm2, 16(%rsi)
+; SSE-NEXT: movaps %xmm7, %xmm8
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
+; SSE-NEXT: movaps %xmm5, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0]
+; SSE-NEXT: movaps %xmm2, %xmm10
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0]
+; SSE-NEXT: movaps %xmm0, %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: movaps %xmm9, 32(%rsi)
+; SSE-NEXT: movaps %xmm11, (%rsi)
+; SSE-NEXT: movaps %xmm8, 48(%rsi)
+; SSE-NEXT: movaps %xmm10, 16(%rsi)
; SSE-NEXT: movaps %xmm5, 32(%rdx)
-; SSE-NEXT: movaps %xmm6, (%rdx)
+; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm7, 48(%rdx)
-; SSE-NEXT: movaps %xmm4, 16(%rdx)
+; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i64_stride2_vf8:
@@ -216,8 +216,8 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i64_stride2_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm9
-; SSE-NEXT: movaps 32(%rdi), %xmm14
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movaps 32(%rdi), %xmm1
; SSE-NEXT: movaps 48(%rdi), %xmm8
; SSE-NEXT: movaps 208(%rdi), %xmm10
; SSE-NEXT: movaps 192(%rdi), %xmm2
@@ -227,18 +227,18 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-NEXT: movaps 64(%rdi), %xmm6
; SSE-NEXT: movaps 240(%rdi), %xmm13
; SSE-NEXT: movaps 224(%rdi), %xmm4
-; SSE-NEXT: movaps 176(%rdi), %xmm15
+; SSE-NEXT: movaps 176(%rdi), %xmm14
; SSE-NEXT: movaps 160(%rdi), %xmm5
-; SSE-NEXT: movaps 112(%rdi), %xmm1
+; SSE-NEXT: movaps 112(%rdi), %xmm15
; SSE-NEXT: movaps 96(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1]
-; SSE-NEXT: movaps %xmm4, %xmm15
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0]
+; SSE-NEXT: movaps %xmm7, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm15[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1]
+; SSE-NEXT: movaps %xmm5, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1]
+; SSE-NEXT: movaps %xmm4, %xmm14
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1]
; SSE-NEXT: movaps %xmm2, %xmm13
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm10[0]
@@ -249,23 +249,23 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-NEXT: movaps %xmm6, %xmm11
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1]
-; SSE-NEXT: movaps %xmm14, %xmm12
+; SSE-NEXT: movaps %xmm1, %xmm12
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm8[1]
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm9, %xmm14
-; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1]
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
; SSE-NEXT: movaps %xmm13, 96(%rsi)
; SSE-NEXT: movaps %xmm10, 64(%rsi)
; SSE-NEXT: movaps %xmm11, 32(%rsi)
-; SSE-NEXT: movaps %xmm14, (%rsi)
-; SSE-NEXT: movaps %xmm15, 112(%rsi)
-; SSE-NEXT: movaps %xmm1, 80(%rsi)
-; SSE-NEXT: movaps %xmm0, 48(%rsi)
+; SSE-NEXT: movaps %xmm1, (%rsi)
+; SSE-NEXT: movaps %xmm14, 112(%rsi)
+; SSE-NEXT: movaps %xmm15, 80(%rsi)
+; SSE-NEXT: movaps %xmm9, 48(%rsi)
; SSE-NEXT: movaps %xmm12, 16(%rsi)
-; SSE-NEXT: movaps %xmm9, (%rdx)
+; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm6, 32(%rdx)
; SSE-NEXT: movaps %xmm3, 64(%rdx)
; SSE-NEXT: movaps %xmm2, 96(%rdx)
@@ -380,61 +380,61 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-LABEL: load_i64_stride2_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $152, %rsp
-; SSE-NEXT: movaps 272(%rdi), %xmm15
+; SSE-NEXT: movaps 272(%rdi), %xmm8
; SSE-NEXT: movaps 208(%rdi), %xmm9
-; SSE-NEXT: movaps 192(%rdi), %xmm5
+; SSE-NEXT: movaps 192(%rdi), %xmm2
; SSE-NEXT: movaps 144(%rdi), %xmm10
-; SSE-NEXT: movaps 128(%rdi), %xmm4
-; SSE-NEXT: movaps 80(%rdi), %xmm13
-; SSE-NEXT: movaps 64(%rdi), %xmm3
-; SSE-NEXT: movaps 304(%rdi), %xmm14
-; SSE-NEXT: movaps 288(%rdi), %xmm12
-; SSE-NEXT: movaps 240(%rdi), %xmm2
-; SSE-NEXT: movaps 224(%rdi), %xmm11
-; SSE-NEXT: movaps 176(%rdi), %xmm7
-; SSE-NEXT: movaps 160(%rdi), %xmm8
-; SSE-NEXT: movaps 112(%rdi), %xmm0
-; SSE-NEXT: movaps 96(%rdi), %xmm6
-; SSE-NEXT: movaps %xmm6, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm3, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0]
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm13[1]
+; SSE-NEXT: movaps 128(%rdi), %xmm1
+; SSE-NEXT: movaps 80(%rdi), %xmm12
+; SSE-NEXT: movaps 64(%rdi), %xmm0
+; SSE-NEXT: movaps 304(%rdi), %xmm11
+; SSE-NEXT: movaps 288(%rdi), %xmm6
+; SSE-NEXT: movaps 240(%rdi), %xmm13
+; SSE-NEXT: movaps 224(%rdi), %xmm5
+; SSE-NEXT: movaps 176(%rdi), %xmm15
+; SSE-NEXT: movaps 160(%rdi), %xmm4
+; SSE-NEXT: movaps 112(%rdi), %xmm14
+; SSE-NEXT: movaps 96(%rdi), %xmm3
+; SSE-NEXT: movaps %xmm3, %xmm7
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm14[0]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0]
+; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1]
-; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm4, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm15[1]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1]
-; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0]
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm6, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 256(%rdi), %xmm0
; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 368(%rdi), %xmm0
; SSE-NEXT: movaps 352(%rdi), %xmm15
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
index 6b194a077320c..9173b17a18107 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -172,9 +172,9 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i64_stride3_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movapd 128(%rdi), %xmm14
-; SSE-NEXT: movapd 176(%rdi), %xmm13
-; SSE-NEXT: movapd 80(%rdi), %xmm12
+; SSE-NEXT: movapd 128(%rdi), %xmm2
+; SSE-NEXT: movapd 176(%rdi), %xmm1
+; SSE-NEXT: movapd 80(%rdi), %xmm0
; SSE-NEXT: movapd 96(%rdi), %xmm4
; SSE-NEXT: movapd 112(%rdi), %xmm8
; SSE-NEXT: movapd 144(%rdi), %xmm3
@@ -184,34 +184,34 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movapd 32(%rdi), %xmm5
; SSE-NEXT: movapd 48(%rdi), %xmm7
; SSE-NEXT: movapd 64(%rdi), %xmm11
-; SSE-NEXT: movapd %xmm11, %xmm15
-; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1]
-; SSE-NEXT: movapd %xmm9, %xmm1
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; SSE-NEXT: movapd %xmm8, %xmm2
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
-; SSE-NEXT: movapd %xmm10, %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1]
-; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm12[0]
-; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm13[0]
-; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm14[0]
+; SSE-NEXT: movapd %xmm11, %xmm12
+; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm7[0],xmm12[1]
+; SSE-NEXT: movapd %xmm9, %xmm13
+; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm3[0],xmm13[1]
+; SSE-NEXT: movapd %xmm8, %xmm14
+; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1]
+; SSE-NEXT: movapd %xmm10, %xmm15
+; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1]
+; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm0[0]
+; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0]
+; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0]
; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm5[0]
-; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1]
-; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm9[0],xmm13[1]
-; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm8[0],xmm14[1]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1]
-; SSE-NEXT: movapd %xmm2, 32(%rsi)
-; SSE-NEXT: movapd %xmm0, (%rsi)
-; SSE-NEXT: movapd %xmm1, 48(%rsi)
-; SSE-NEXT: movapd %xmm15, 16(%rsi)
+; SSE-NEXT: movapd %xmm14, 32(%rsi)
+; SSE-NEXT: movapd %xmm15, (%rsi)
+; SSE-NEXT: movapd %xmm13, 48(%rsi)
+; SSE-NEXT: movapd %xmm12, 16(%rsi)
; SSE-NEXT: movapd %xmm4, 32(%rdx)
; SSE-NEXT: movapd %xmm6, (%rdx)
; SSE-NEXT: movapd %xmm3, 48(%rdx)
; SSE-NEXT: movapd %xmm7, 16(%rdx)
-; SSE-NEXT: movapd %xmm14, 32(%rcx)
+; SSE-NEXT: movapd %xmm2, 32(%rcx)
; SSE-NEXT: movapd %xmm5, (%rcx)
-; SSE-NEXT: movapd %xmm13, 48(%rcx)
-; SSE-NEXT: movapd %xmm12, 16(%rcx)
+; SSE-NEXT: movapd %xmm1, 48(%rcx)
+; SSE-NEXT: movapd %xmm0, 16(%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i64_stride3_vf8:
@@ -327,52 +327,50 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-LABEL: load_i64_stride3_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $24, %rsp
-; SSE-NEXT: movaps 224(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movapd 224(%rdi), %xmm2
; SSE-NEXT: movapd 128(%rdi), %xmm0
-; SSE-NEXT: movapd 272(%rdi), %xmm6
-; SSE-NEXT: movapd 176(%rdi), %xmm5
+; SSE-NEXT: movapd 272(%rdi), %xmm4
+; SSE-NEXT: movapd 176(%rdi), %xmm3
; SSE-NEXT: movapd 80(%rdi), %xmm1
-; SSE-NEXT: movapd 192(%rdi), %xmm7
+; SSE-NEXT: movapd 192(%rdi), %xmm5
; SSE-NEXT: movapd 208(%rdi), %xmm11
-; SSE-NEXT: movapd 96(%rdi), %xmm8
+; SSE-NEXT: movapd 96(%rdi), %xmm6
; SSE-NEXT: movapd 112(%rdi), %xmm12
-; SSE-NEXT: movapd 240(%rdi), %xmm4
+; SSE-NEXT: movapd 240(%rdi), %xmm7
; SSE-NEXT: movapd 256(%rdi), %xmm13
-; SSE-NEXT: movapd 144(%rdi), %xmm15
-; SSE-NEXT: movapd 160(%rdi), %xmm9
-; SSE-NEXT: movapd 48(%rdi), %xmm14
-; SSE-NEXT: movapd 64(%rdi), %xmm3
-; SSE-NEXT: movapd %xmm3, %xmm2
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill
-; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE-NEXT: movapd 144(%rdi), %xmm10
+; SSE-NEXT: movapd 160(%rdi), %xmm14
+; SSE-NEXT: movapd 48(%rdi), %xmm9
+; SSE-NEXT: movapd 64(%rdi), %xmm15
+; SSE-NEXT: movapd %xmm15, %xmm8
+; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1]
+; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill
+; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm1[0]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movapd %xmm9, %xmm10
-; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1]
-; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm5[0]
-; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm9[0],xmm5[1]
-; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movapd %xmm12, %xmm9
-; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1]
-; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm0[0]
-; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movapd %xmm14, %xmm15
+; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1]
+; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm3[0]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movapd %xmm12, %xmm14
+; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm6[0],xmm14[1]
+; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm0[0]
+; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm13, %xmm12
-; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1]
-; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm6[0]
+; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm7[0],xmm12[1]
+; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0]
+; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm13[0],xmm4[1]
; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm13[0],xmm6[1]
-; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm11, %xmm13
-; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm7[0],xmm13[1]
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm0[0]
-; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm5[0],xmm13[1]
+; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0]
+; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1]
+; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd 336(%rdi), %xmm11
; SSE-NEXT: movapd 352(%rdi), %xmm1
; SSE-NEXT: movapd %xmm1, %xmm8
@@ -396,11 +394,11 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1]
; SSE-NEXT: movapd %xmm2, 96(%rsi)
; SSE-NEXT: movapd %xmm13, 64(%rsi)
-; SSE-NEXT: movapd %xmm9, 32(%rsi)
+; SSE-NEXT: movapd %xmm14, 32(%rsi)
; SSE-NEXT: movapd %xmm5, (%rsi)
; SSE-NEXT: movapd %xmm8, 112(%rsi)
; SSE-NEXT: movapd %xmm12, 80(%rsi)
-; SSE-NEXT: movapd %xmm10, 48(%rsi)
+; SSE-NEXT: movapd %xmm15, 48(%rsi)
; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 16(%rsi)
; SSE-NEXT: movapd %xmm1, 96(%rdx)
@@ -412,8 +410,8 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movapd %xmm11, 112(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 80(%rdx)
-; SSE-NEXT: movapd %xmm15, 48(%rdx)
-; SSE-NEXT: movapd %xmm14, 16(%rdx)
+; SSE-NEXT: movapd %xmm10, 48(%rdx)
+; SSE-NEXT: movapd %xmm9, 16(%rdx)
; SSE-NEXT: movapd %xmm0, 96(%rcx)
; SSE-NEXT: movapd %xmm6, 112(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -508,15 +506,15 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm9[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm4
; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm5[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
@@ -541,29 +539,29 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vmovaps 16(%rdi), %xmm0
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vmovaps 304(%rdi), %xmm1
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vmovaps 208(%rdi), %xmm2
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vmovaps 16(%rdi), %xmm9
+; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vmovaps 304(%rdi), %xmm10
+; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = mem[0,1,0,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-NEXT: vmovaps 208(%rdi), %xmm11
+; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
-; AVX2-NEXT: vmovaps %ymm15, 96(%rsi)
-; AVX2-NEXT: vmovaps %ymm14, (%rsi)
-; AVX2-NEXT: vmovaps %ymm13, 32(%rsi)
+; AVX2-NEXT: vmovaps %ymm2, 96(%rsi)
+; AVX2-NEXT: vmovaps %ymm1, (%rsi)
+; AVX2-NEXT: vmovaps %ymm0, 32(%rsi)
; AVX2-NEXT: vmovaps %ymm6, 96(%rdx)
; AVX2-NEXT: vmovaps %ymm8, (%rdx)
; AVX2-NEXT: vmovaps %ymm5, 32(%rdx)
; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
-; AVX2-NEXT: vmovaps %ymm2, 64(%rcx)
-; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
-; AVX2-NEXT: vmovaps %ymm0, (%rcx)
+; AVX2-NEXT: vmovaps %ymm11, 64(%rcx)
+; AVX2-NEXT: vmovaps %ymm10, 96(%rcx)
+; AVX2-NEXT: vmovaps %ymm9, (%rcx)
; AVX2-NEXT: vmovaps %ymm7, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
index 46f0f4a10e504..c039b8e160db6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
@@ -94,28 +94,28 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i64_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: load_i64_stride4_vf4:
; SSE: # %bb.0:
-; SSE-NEXT: movaps 112(%rdi), %xmm8
+; SSE-NEXT: movaps 112(%rdi), %xmm0
; SSE-NEXT: movaps 80(%rdi), %xmm1
; SSE-NEXT: movaps (%rdi), %xmm2
; SSE-NEXT: movaps 16(%rdi), %xmm3
; SSE-NEXT: movaps 32(%rdi), %xmm4
-; SSE-NEXT: movaps 48(%rdi), %xmm9
+; SSE-NEXT: movaps 48(%rdi), %xmm5
; SSE-NEXT: movaps 96(%rdi), %xmm6
; SSE-NEXT: movaps 64(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE-NEXT: movaps %xmm2, %xmm5
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-NEXT: movaps %xmm7, %xmm8
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
+; SSE-NEXT: movaps %xmm2, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT: movaps %xmm3, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1]
-; SSE-NEXT: movaps %xmm0, 16(%rsi)
-; SSE-NEXT: movaps %xmm5, (%rsi)
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
+; SSE-NEXT: movaps %xmm8, 16(%rsi)
+; SSE-NEXT: movaps %xmm9, (%rsi)
; SSE-NEXT: movaps %xmm7, 16(%rdx)
; SSE-NEXT: movaps %xmm2, (%rdx)
; SSE-NEXT: movaps %xmm4, 16(%rcx)
@@ -202,181 +202,181 @@ define void @load_i64_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: load_i64_stride4_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movaps 112(%rdi), %xmm8
-; SSE-NEXT: movaps 80(%rdi), %xmm14
-; SSE-NEXT: movaps 240(%rdi), %xmm9
-; SSE-NEXT: movaps 208(%rdi), %xmm13
+; SSE-NEXT: movaps 112(%rdi), %xmm5
+; SSE-NEXT: movaps 80(%rdi), %xmm3
+; SSE-NEXT: movaps 240(%rdi), %xmm6
+; SSE-NEXT: movaps 208(%rdi), %xmm2
; SSE-NEXT: movaps 176(%rdi), %xmm10
-; SSE-NEXT: movaps 144(%rdi), %xmm12
+; SSE-NEXT: movaps 144(%rdi), %xmm1
; SSE-NEXT: movaps (%rdi), %xmm4
-; SSE-NEXT: movaps 16(%rdi), %xmm11
-; SSE-NEXT: movaps 32(%rdi), %xmm15
-; SSE-NEXT: movaps 224(%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm0
+; SSE-NEXT: movaps 32(%rdi), %xmm12
+; SSE-NEXT: movaps 224(%rdi), %xmm13
; SSE-NEXT: movaps 192(%rdi), %xmm7
-; SSE-NEXT: movaps 96(%rdi), %xmm3
-; SSE-NEXT: movaps 64(%rdi), %xmm6
-; SSE-NEXT: movaps 160(%rdi), %xmm1
-; SSE-NEXT: movaps 128(%rdi), %xmm5
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; SSE-NEXT: movaps %xmm6, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-NEXT: movaps %xmm7, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
-; SSE-NEXT: movaps %xmm4, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm15[1]
-; SSE-NEXT: movaps %xmm12, %xmm15
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm10[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1]
-; SSE-NEXT: movaps %xmm14, %xmm10
-; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm8[1]
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm13, %xmm8
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1]
-; SSE-NEXT: movaps 48(%rdi), %xmm9
-; SSE-NEXT: movaps %xmm11, %xmm14
-; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm9[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1]
-; SSE-NEXT: movaps %xmm3, 48(%rsi)
-; SSE-NEXT: movaps %xmm1, 16(%rsi)
-; SSE-NEXT: movaps %xmm0, 32(%rsi)
-; SSE-NEXT: movaps %xmm2, (%rsi)
+; SSE-NEXT: movaps 96(%rdi), %xmm14
+; SSE-NEXT: movaps 64(%rdi), %xmm9
+; SSE-NEXT: movaps 160(%rdi), %xmm15
+; SSE-NEXT: movaps 128(%rdi), %xmm8
+; SSE-NEXT: movaps %xmm8, %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm15[1]
+; SSE-NEXT: movaps %xmm9, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1]
+; SSE-NEXT: movaps %xmm7, %xmm14
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1]
+; SSE-NEXT: movaps %xmm4, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1]
+; SSE-NEXT: movaps %xmm1, %xmm12
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1]
+; SSE-NEXT: movaps %xmm3, %xmm10
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm5[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1]
+; SSE-NEXT: movaps 48(%rdi), %xmm6
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
+; SSE-NEXT: movaps %xmm14, 48(%rsi)
+; SSE-NEXT: movaps %xmm15, 16(%rsi)
+; SSE-NEXT: movaps %xmm11, 32(%rsi)
+; SSE-NEXT: movaps %xmm13, (%rsi)
; SSE-NEXT: movaps %xmm7, 48(%rdx)
-; SSE-NEXT: movaps %xmm6, 16(%rdx)
+; SSE-NEXT: movaps %xmm9, 16(%rdx)
; SSE-NEXT: movaps %xmm4, (%rdx)
-; SSE-NEXT: movaps %xmm5, 32(%rdx)
+; SSE-NEXT: movaps %xmm8, 32(%rdx)
; SSE-NEXT: movaps %xmm10, 16(%rcx)
-; SSE-NEXT: movaps %xmm8, 48(%rcx)
-; SSE-NEXT: movaps %xmm15, 32(%rcx)
-; SSE-NEXT: movaps %xmm14, (%rcx)
-; SSE-NEXT: movaps %xmm13, 48(%r8)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 16(%r8)
-; SSE-NEXT: movaps %xmm12, 32(%r8)
-; SSE-NEXT: movaps %xmm11, (%r8)
+; SSE-NEXT: movaps %xmm5, 48(%rcx)
+; SSE-NEXT: movaps %xmm12, 32(%rcx)
+; SSE-NEXT: movaps %xmm3, (%rcx)
+; SSE-NEXT: movaps %xmm2, 48(%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, 16(%r8)
+; SSE-NEXT: movaps %xmm1, 32(%r8)
+; SSE-NEXT: movaps %xmm0, (%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i64_stride4_vf8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps 224(%rdi), %ymm10
-; AVX1-NEXT: vmovaps 192(%rdi), %ymm11
-; AVX1-NEXT: vmovaps 96(%rdi), %ymm13
-; AVX1-NEXT: vmovaps 64(%rdi), %ymm14
+; AVX1-NEXT: vmovaps 224(%rdi), %ymm3
+; AVX1-NEXT: vmovaps 192(%rdi), %ymm2
+; AVX1-NEXT: vmovaps 96(%rdi), %ymm4
+; AVX1-NEXT: vmovaps 64(%rdi), %ymm5
; AVX1-NEXT: vmovaps 32(%rdi), %xmm6
; AVX1-NEXT: vmovaps (%rdi), %xmm7
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 160(%rdi), %xmm0
-; AVX1-NEXT: vmovaps 128(%rdi), %xmm3
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm0[0]
-; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 224(%rdi), %xmm1
-; AVX1-NEXT: vmovaps 192(%rdi), %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vmovaps 96(%rdi), %xmm4
-; AVX1-NEXT: vmovaps 64(%rdi), %xmm5
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm4[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1]
+; AVX1-NEXT: vmovaps 160(%rdi), %xmm8
+; AVX1-NEXT: vmovaps 128(%rdi), %xmm9
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm8[0]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm5
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm6
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
+; AVX1-NEXT: vmovaps 224(%rdi), %xmm10
+; AVX1-NEXT: vmovaps 192(%rdi), %xmm11
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0]
+; AVX1-NEXT: vmovaps 96(%rdi), %xmm13
+; AVX1-NEXT: vmovaps 64(%rdi), %xmm14
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm11[1],xmm10[1]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm13[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm11
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm13
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm11[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX1-NEXT: vmovaps 176(%rdi), %xmm1
; AVX1-NEXT: vmovaps 144(%rdi), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm13[1],xmm11[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: vmovaps %xmm15, 16(%rsi)
; AVX1-NEXT: vmovaps %xmm12, 48(%rsi)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm1, 32(%rsi)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm1, (%rsi)
-; AVX1-NEXT: vmovaps %xmm2, 16(%rdx)
-; AVX1-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX1-NEXT: vmovaps %xmm8, (%rdx)
+; AVX1-NEXT: vmovaps %xmm9, 16(%rdx)
+; AVX1-NEXT: vmovaps %xmm7, 48(%rdx)
+; AVX1-NEXT: vmovaps %xmm6, (%rdx)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm1, 32(%rdx)
-; AVX1-NEXT: vmovaps %ymm7, 32(%rcx)
-; AVX1-NEXT: vmovaps %ymm4, (%rcx)
+; AVX1-NEXT: vmovaps %ymm8, 32(%rcx)
+; AVX1-NEXT: vmovaps %ymm10, (%rcx)
; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
-; AVX1-NEXT: vmovaps %ymm5, (%r8)
+; AVX1-NEXT: vmovaps %ymm4, (%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i64_stride4_vf8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps 160(%rdi), %ymm9
-; AVX2-NEXT: vmovaps 128(%rdi), %ymm10
-; AVX2-NEXT: vmovaps 32(%rdi), %ymm12
-; AVX2-NEXT: vmovaps (%rdi), %ymm13
-; AVX2-NEXT: vmovaps 96(%rdi), %ymm14
-; AVX2-NEXT: vmovaps 64(%rdi), %ymm15
+; AVX2-NEXT: vmovaps 160(%rdi), %ymm11
+; AVX2-NEXT: vmovaps 128(%rdi), %ymm8
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm2
+; AVX2-NEXT: vmovaps (%rdi), %ymm3
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm4
+; AVX2-NEXT: vmovaps 64(%rdi), %ymm5
; AVX2-NEXT: vmovaps 224(%rdi), %xmm6
; AVX2-NEXT: vmovaps 192(%rdi), %xmm7
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 160(%rdi), %xmm0
-; AVX2-NEXT: vmovaps 128(%rdi), %xmm1
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 96(%rdi), %xmm2
-; AVX2-NEXT: vmovaps (%rdi), %xmm3
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
-; AVX2-NEXT: vmovaps 64(%rdi), %xmm5
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1]
-; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm2[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm1[1],xmm0[1]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
-; AVX2-NEXT: vmovaps 224(%rdi), %ymm5
+; AVX2-NEXT: vmovaps 160(%rdi), %xmm9
+; AVX2-NEXT: vmovaps 128(%rdi), %xmm10
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0]
+; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 96(%rdi), %xmm12
+; AVX2-NEXT: vmovaps (%rdi), %xmm13
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm14
+; AVX2-NEXT: vmovaps 64(%rdi), %xmm15
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1]
+; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm15[0],xmm12[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm9[1]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm13[0],xmm14[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm15[1],xmm12[1]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3]
+; AVX2-NEXT: vmovaps 224(%rdi), %ymm15
; AVX2-NEXT: vmovaps 192(%rdi), %ymm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[2],ymm11[2]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm11[1],ymm8[3],ymm11[3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
; AVX2-NEXT: vmovaps %xmm7, 16(%rsi)
-; AVX2-NEXT: vmovaps %xmm6, (%rsi)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-NEXT: vmovaps %xmm5, 32(%rsi)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-NEXT: vmovaps %xmm5, 48(%rsi)
-; AVX2-NEXT: vmovaps %xmm3, (%rdx)
-; AVX2-NEXT: vmovaps %xmm2, 16(%rdx)
-; AVX2-NEXT: vmovaps %xmm8, 32(%rdx)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vmovaps %xmm2, 48(%rdx)
+; AVX2-NEXT: vmovaps %xmm10, (%rsi)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-NEXT: vmovaps %xmm3, 32(%rsi)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-NEXT: vmovaps %xmm3, 48(%rsi)
+; AVX2-NEXT: vmovaps %xmm13, (%rdx)
+; AVX2-NEXT: vmovaps %xmm12, 16(%rdx)
+; AVX2-NEXT: vmovaps %xmm9, 32(%rdx)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-NEXT: vmovaps %xmm3, 48(%rdx)
; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX2-NEXT: vmovaps %ymm4, (%rcx)
+; AVX2-NEXT: vmovaps %ymm14, (%rcx)
; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
-; AVX2-NEXT: vmovaps %ymm11, (%r8)
+; AVX2-NEXT: vmovaps %ymm2, (%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -434,61 +434,61 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-LABEL: load_i64_stride4_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $152, %rsp
-; SSE-NEXT: movaps (%rdi), %xmm8
-; SSE-NEXT: movaps 416(%rdi), %xmm13
-; SSE-NEXT: movaps 384(%rdi), %xmm7
-; SSE-NEXT: movaps 288(%rdi), %xmm15
-; SSE-NEXT: movaps 256(%rdi), %xmm9
+; SSE-NEXT: movaps (%rdi), %xmm5
+; SSE-NEXT: movaps 416(%rdi), %xmm0
+; SSE-NEXT: movaps 384(%rdi), %xmm6
+; SSE-NEXT: movaps 288(%rdi), %xmm1
+; SSE-NEXT: movaps 256(%rdi), %xmm7
; SSE-NEXT: movaps 160(%rdi), %xmm2
; SSE-NEXT: movaps 128(%rdi), %xmm10
; SSE-NEXT: movaps 480(%rdi), %xmm3
; SSE-NEXT: movaps 448(%rdi), %xmm11
; SSE-NEXT: movaps 352(%rdi), %xmm4
; SSE-NEXT: movaps 320(%rdi), %xmm12
-; SSE-NEXT: movaps 224(%rdi), %xmm5
-; SSE-NEXT: movaps 192(%rdi), %xmm14
-; SSE-NEXT: movaps 96(%rdi), %xmm6
-; SSE-NEXT: movaps 64(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm14, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1]
+; SSE-NEXT: movaps 224(%rdi), %xmm8
+; SSE-NEXT: movaps 192(%rdi), %xmm13
+; SSE-NEXT: movaps 96(%rdi), %xmm9
+; SSE-NEXT: movaps 64(%rdi), %xmm14
+; SSE-NEXT: movaps %xmm14, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0]
+; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm13, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0]
+; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm8[1]
+; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm12, %xmm8
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0]
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps %xmm11, %xmm4
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm10, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1]
-; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1]
+; SSE-NEXT: movaps %xmm7, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm6, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 32(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm8, %xmm15
+; SSE-NEXT: movaps %xmm5, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
-; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%rdi), %xmm0
; SSE-NEXT: movaps 80(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm2
@@ -586,48 +586,48 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-LABEL: load_i64_stride4_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $296, %rsp # imm = 0x128
-; AVX1-NEXT: vmovaps 224(%rdi), %xmm8
-; AVX1-NEXT: vmovaps 192(%rdi), %xmm9
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm8[0]
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 96(%rdi), %xmm10
-; AVX1-NEXT: vmovaps 64(%rdi), %xmm11
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0]
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps 224(%rdi), %xmm0
+; AVX1-NEXT: vmovaps 192(%rdi), %xmm1
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps 96(%rdi), %xmm2
+; AVX1-NEXT: vmovaps 64(%rdi), %xmm3
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 352(%rdi), %xmm4
; AVX1-NEXT: vmovaps 320(%rdi), %xmm5
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm4[0]
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 160(%rdi), %xmm12
-; AVX1-NEXT: vmovaps 128(%rdi), %xmm1
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm12[0]
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm2
-; AVX1-NEXT: vmovaps (%rdi), %xmm3
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
+; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps 160(%rdi), %xmm7
+; AVX1-NEXT: vmovaps 128(%rdi), %xmm8
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm7[0]
+; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm9
+; AVX1-NEXT: vmovaps (%rdi), %xmm10
; AVX1-NEXT: vmovaps 288(%rdi), %xmm6
-; AVX1-NEXT: vmovaps 256(%rdi), %xmm0
+; AVX1-NEXT: vmovaps 256(%rdi), %xmm11
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm6[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm6[0]
; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm6[1]
+; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm9[0]
+; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 416(%rdi), %xmm0
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1]
; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps 480(%rdi), %xmm1
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm10[1]
-; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 448(%rdi), %xmm4
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0]
-; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; AVX1-NEXT: vmovaps 448(%rdi), %xmm2
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm10[1],xmm9[1]
+; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 384(%rdi), %xmm1
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
@@ -731,53 +731,53 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-LABEL: load_i64_stride4_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $296, %rsp # imm = 0x128
-; AVX2-NEXT: vmovaps 224(%rdi), %xmm11
-; AVX2-NEXT: vmovaps 192(%rdi), %xmm15
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm11[0]
-; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 96(%rdi), %xmm7
-; AVX2-NEXT: vmovaps (%rdi), %xmm9
-; AVX2-NEXT: vmovaps 64(%rdi), %xmm4
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm7[0]
+; AVX2-NEXT: vmovaps 224(%rdi), %xmm0
+; AVX2-NEXT: vmovaps 192(%rdi), %xmm6
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm0[0]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 96(%rdi), %xmm9
+; AVX2-NEXT: vmovaps (%rdi), %xmm11
+; AVX2-NEXT: vmovaps 64(%rdi), %xmm10
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 352(%rdi), %xmm4
+; AVX2-NEXT: vmovaps 320(%rdi), %xmm5
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 160(%rdi), %xmm8
+; AVX2-NEXT: vmovaps 128(%rdi), %xmm12
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm8[0]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 288(%rdi), %xmm13
+; AVX2-NEXT: vmovaps 256(%rdi), %xmm14
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0]
+; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 416(%rdi), %xmm15
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm13[1]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 384(%rdi), %xmm13
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm0[1]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 352(%rdi), %xmm8
-; AVX2-NEXT: vmovaps 320(%rdi), %xmm6
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm8[0]
+; AVX2-NEXT: vmovaps 480(%rdi), %xmm0
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm8[1]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 448(%rdi), %xmm12
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm9[1]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm0[0]
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm0[1]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 160(%rdi), %xmm3
-; AVX2-NEXT: vmovaps 128(%rdi), %xmm2
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm13[0],xmm15[0]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 288(%rdi), %xmm1
-; AVX2-NEXT: vmovaps 256(%rdi), %xmm5
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0]
-; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm8[1]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm15[1]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 416(%rdi), %xmm0
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1]
-; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 384(%rdi), %xmm1
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm15[1],xmm11[1]
-; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 480(%rdi), %xmm11
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 448(%rdi), %xmm2
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm7[1]
-; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm11[0]
-; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1]
-; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm2
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm2[0]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm0[0]
-; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm0[1]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm2[1]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 160(%rdi), %ymm9
; AVX2-NEXT: vmovaps 128(%rdi), %ymm8
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 3ebc7a41d8f66..df2886a72e34d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -103,54 +103,54 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-LABEL: load_i64_stride6_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps 176(%rdi), %xmm8
-; SSE-NEXT: movaps 128(%rdi), %xmm12
-; SSE-NEXT: movaps 80(%rdi), %xmm9
-; SSE-NEXT: movaps 160(%rdi), %xmm10
-; SSE-NEXT: movaps 112(%rdi), %xmm14
-; SSE-NEXT: movaps 64(%rdi), %xmm11
+; SSE-NEXT: movaps 176(%rdi), %xmm4
+; SSE-NEXT: movaps 128(%rdi), %xmm0
+; SSE-NEXT: movaps 80(%rdi), %xmm6
+; SSE-NEXT: movaps 160(%rdi), %xmm7
+; SSE-NEXT: movaps 112(%rdi), %xmm2
+; SSE-NEXT: movaps 64(%rdi), %xmm8
; SSE-NEXT: movaps (%rdi), %xmm5
; SSE-NEXT: movaps 16(%rdi), %xmm3
-; SSE-NEXT: movaps 32(%rdi), %xmm13
-; SSE-NEXT: movaps 48(%rdi), %xmm6
-; SSE-NEXT: movaps 144(%rdi), %xmm7
-; SSE-NEXT: movaps 96(%rdi), %xmm4
-; SSE-NEXT: movaps %xmm4, %xmm15
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; SSE-NEXT: movaps %xmm3, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0]
-; SSE-NEXT: movaps %xmm14, %xmm7
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1]
-; SSE-NEXT: movaps %xmm13, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm8[1]
-; SSE-NEXT: movaps %xmm15, 16(%rsi)
-; SSE-NEXT: movaps %xmm1, (%rsi)
-; SSE-NEXT: movaps %xmm4, 16(%rdx)
+; SSE-NEXT: movaps 32(%rdi), %xmm1
+; SSE-NEXT: movaps 48(%rdi), %xmm9
+; SSE-NEXT: movaps 144(%rdi), %xmm10
+; SSE-NEXT: movaps 96(%rdi), %xmm11
+; SSE-NEXT: movaps %xmm11, %xmm12
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0]
+; SSE-NEXT: movaps %xmm5, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1]
+; SSE-NEXT: movaps %xmm3, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0]
+; SSE-NEXT: movaps %xmm2, %xmm10
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
+; SSE-NEXT: movaps %xmm1, %xmm7
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
+; SSE-NEXT: movaps %xmm0, %xmm8
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE-NEXT: movaps %xmm12, 16(%rsi)
+; SSE-NEXT: movaps %xmm13, (%rsi)
+; SSE-NEXT: movaps %xmm11, 16(%rdx)
; SSE-NEXT: movaps %xmm5, (%rdx)
-; SSE-NEXT: movaps %xmm7, 16(%rcx)
-; SSE-NEXT: movaps %xmm6, (%rcx)
-; SSE-NEXT: movaps %xmm14, 16(%r8)
+; SSE-NEXT: movaps %xmm10, 16(%rcx)
+; SSE-NEXT: movaps %xmm9, (%rcx)
+; SSE-NEXT: movaps %xmm2, 16(%r8)
; SSE-NEXT: movaps %xmm3, (%r8)
-; SSE-NEXT: movaps %xmm0, 16(%r9)
-; SSE-NEXT: movaps %xmm2, (%r9)
-; SSE-NEXT: movaps %xmm12, 16(%rax)
-; SSE-NEXT: movaps %xmm13, (%rax)
+; SSE-NEXT: movaps %xmm8, 16(%r9)
+; SSE-NEXT: movaps %xmm7, (%r9)
+; SSE-NEXT: movaps %xmm0, 16(%rax)
+; SSE-NEXT: movaps %xmm1, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i64_stride6_vf4:
; AVX1: # %bb.0:
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovaps 160(%rdi), %ymm8
+; AVX1-NEXT: vmovaps 160(%rdi), %ymm0
; AVX1-NEXT: vmovaps 96(%rdi), %ymm1
; AVX1-NEXT: vmovaps 128(%rdi), %ymm2
; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm3
@@ -158,34 +158,34 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vmovaps (%rdi), %xmm5
; AVX1-NEXT: vmovaps 16(%rdi), %xmm6
; AVX1-NEXT: vmovaps 32(%rdi), %xmm7
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm0[0]
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm8
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm8[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm0[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX1-NEXT: vmovaps 64(%rdi), %xmm5
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm5[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm8[0],ymm2[2],ymm8[2]
-; AVX1-NEXT: vmovaps 80(%rdi), %xmm6
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm6[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm8[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm3
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+; AVX1-NEXT: vmovaps 64(%rdi), %xmm8
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm8[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm8[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm3
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX1-NEXT: vmovaps 80(%rdi), %xmm8
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm8[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm8[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovaps %ymm4, (%rsi)
-; AVX1-NEXT: vmovaps %ymm0, (%rdx)
-; AVX1-NEXT: vmovaps %ymm3, (%rcx)
+; AVX1-NEXT: vmovaps %ymm2, (%rdx)
+; AVX1-NEXT: vmovaps %ymm5, (%rcx)
; AVX1-NEXT: vmovaps %ymm1, (%r8)
-; AVX1-NEXT: vmovaps %ymm5, (%r9)
-; AVX1-NEXT: vmovaps %ymm2, (%rax)
+; AVX1-NEXT: vmovaps %ymm6, (%r9)
+; AVX1-NEXT: vmovaps %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -202,30 +202,30 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm6[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm8
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm1[1],ymm8[3],ymm1[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm6
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
-; AVX2-NEXT: vmovaps 64(%rdi), %xmm7
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm7[0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
+; AVX2-NEXT: vmovaps 64(%rdi), %xmm8
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm8[0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovaps 80(%rdi), %xmm4
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm4[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm4[0]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm7
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm8
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vmovaps %ymm9, (%rsi)
+; AVX2-NEXT: vmovaps %ymm7, (%rsi)
; AVX2-NEXT: vmovaps %ymm3, (%rdx)
; AVX2-NEXT: vmovaps %ymm6, (%rcx)
; AVX2-NEXT: vmovaps %ymm2, (%r8)
@@ -297,63 +297,61 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i64_stride6_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: subq $40, %rsp
-; SSE-NEXT: movaps 160(%rdi), %xmm8
+; SSE-NEXT: subq $24, %rsp
+; SSE-NEXT: movaps 160(%rdi), %xmm9
; SSE-NEXT: movaps 112(%rdi), %xmm0
-; SSE-NEXT: movaps 352(%rdi), %xmm1
-; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps 352(%rdi), %xmm8
; SSE-NEXT: movaps 256(%rdi), %xmm12
-; SSE-NEXT: movaps 208(%rdi), %xmm9
-; SSE-NEXT: movaps 64(%rdi), %xmm7
-; SSE-NEXT: movaps (%rdi), %xmm11
-; SSE-NEXT: movaps 16(%rdi), %xmm10
-; SSE-NEXT: movaps 48(%rdi), %xmm2
-; SSE-NEXT: movaps 336(%rdi), %xmm3
-; SSE-NEXT: movaps 288(%rdi), %xmm14
-; SSE-NEXT: movaps 144(%rdi), %xmm4
+; SSE-NEXT: movaps 208(%rdi), %xmm1
+; SSE-NEXT: movaps 64(%rdi), %xmm15
+; SSE-NEXT: movaps (%rdi), %xmm3
+; SSE-NEXT: movaps 16(%rdi), %xmm2
+; SSE-NEXT: movaps 48(%rdi), %xmm10
+; SSE-NEXT: movaps 336(%rdi), %xmm14
+; SSE-NEXT: movaps 288(%rdi), %xmm4
+; SSE-NEXT: movaps 144(%rdi), %xmm13
; SSE-NEXT: movaps 96(%rdi), %xmm5
-; SSE-NEXT: movaps 240(%rdi), %xmm1
-; SSE-NEXT: movaps 192(%rdi), %xmm13
-; SSE-NEXT: movaps %xmm13, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
+; SSE-NEXT: movaps 240(%rdi), %xmm11
+; SSE-NEXT: movaps 192(%rdi), %xmm6
+; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0]
+; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1]
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-NEXT: movaps %xmm5, %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm14, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm11, %xmm15
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1]
-; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; SSE-NEXT: movaps %xmm4, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm14
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm10[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1]
-; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm11
-; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1]
-; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, %xmm12
-; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 304(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm8, %xmm9
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
+; SSE-NEXT: movaps 304(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm7, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1]
; SSE-NEXT: movaps 80(%rdi), %xmm1
-; SSE-NEXT: movaps 32(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, %xmm10
+; SSE-NEXT: movaps 32(%rdi), %xmm8
+; SSE-NEXT: movaps %xmm8, %xmm10
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
; SSE-NEXT: movaps 272(%rdi), %xmm1
; SSE-NEXT: movaps 224(%rdi), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm6
@@ -369,25 +367,25 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE-NEXT: movaps %xmm14, 48(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, 16(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm13, 48(%rsi)
+; SSE-NEXT: movaps %xmm11, 16(%rsi)
+; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%rsi)
-; SSE-NEXT: movaps %xmm15, (%rsi)
+; SSE-NEXT: movaps %xmm14, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 48(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rdx)
-; SSE-NEXT: movaps %xmm13, 32(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rdx)
; SSE-NEXT: movaps %xmm12, 16(%rcx)
; SSE-NEXT: movaps %xmm9, 48(%rcx)
-; SSE-NEXT: movaps %xmm11, 32(%rcx)
+; SSE-NEXT: movaps %xmm15, 32(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rcx)
-; SSE-NEXT: movaps %xmm8, 48(%r8)
+; SSE-NEXT: movaps %xmm7, 48(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -402,177 +400,178 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movaps %xmm0, 48(%rax)
; SSE-NEXT: movaps %xmm4, 16(%rax)
; SSE-NEXT: movaps %xmm3, 32(%rax)
-; SSE-NEXT: movaps %xmm7, (%rax)
-; SSE-NEXT: addq $40, %rsp
+; SSE-NEXT: movaps %xmm8, (%rax)
+; SSE-NEXT: addq $24, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i64_stride6_vf8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps 352(%rdi), %ymm10
-; AVX1-NEXT: vmovaps 288(%rdi), %ymm12
-; AVX1-NEXT: vmovaps 96(%rdi), %ymm13
+; AVX1-NEXT: vmovaps 352(%rdi), %ymm0
+; AVX1-NEXT: vmovaps 288(%rdi), %ymm4
+; AVX1-NEXT: vmovaps 96(%rdi), %ymm7
; AVX1-NEXT: vmovaps 128(%rdi), %ymm3
; AVX1-NEXT: vmovaps 320(%rdi), %ymm5
; AVX1-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm6
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
-; AVX1-NEXT: vmovaps 240(%rdi), %xmm2
-; AVX1-NEXT: vmovaps 192(%rdi), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vmovaps 240(%rdi), %xmm8
+; AVX1-NEXT: vmovaps 192(%rdi), %xmm9
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm8[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm10
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
+; AVX1-NEXT: vmovaps (%rdi), %xmm11
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm12
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm13
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm11[0],xmm13[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm8
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX1-NEXT: vmovaps (%rdi), %xmm1
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm4
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm7
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm7[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm13[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
-; AVX1-NEXT: vmovaps 64(%rdi), %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm2[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
-; AVX1-NEXT: vmovaps 256(%rdi), %xmm3
-; AVX1-NEXT: vmovaps 208(%rdi), %xmm5
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm3[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX1-NEXT: vmovaps 160(%rdi), %ymm7
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3]
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm13
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm2[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm7[0],ymm2[2],ymm7[2]
-; AVX1-NEXT: vmovaps 80(%rdi), %xmm4
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm4[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm5
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm10[0],ymm5[2],ymm10[2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm8[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm9
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
+; AVX1-NEXT: vmovaps 64(%rdi), %xmm10
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm12[0],xmm10[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm11
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm11[0],ymm4[2],ymm11[2]
+; AVX1-NEXT: vmovaps 256(%rdi), %xmm13
+; AVX1-NEXT: vmovaps 208(%rdi), %xmm14
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
+; AVX1-NEXT: vmovaps 160(%rdi), %ymm15
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3]
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm9
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm11[1],ymm4[3],ymm11[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm14[1],xmm13[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm10
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm15[0],ymm10[2],ymm15[2]
+; AVX1-NEXT: vmovaps 80(%rdi), %xmm12
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm9[0],xmm12[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm13
+; AVX1-NEXT: vmovaps %ymm0, %ymm3
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
; AVX1-NEXT: vmovaps 272(%rdi), %xmm1
; AVX1-NEXT: vmovaps 224(%rdi), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm4[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm10[1],ymm5[3],ymm10[3]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm15[1],ymm10[3],ymm15[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm3[1],ymm13[3],ymm3[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, (%rsi)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, 32(%rsi)
-; AVX1-NEXT: vmovaps %ymm15, 32(%rdx)
+; AVX1-NEXT: vmovaps %ymm5, 32(%rdx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, (%rdx)
-; AVX1-NEXT: vmovaps %ymm9, 32(%rcx)
-; AVX1-NEXT: vmovaps %ymm11, (%rcx)
-; AVX1-NEXT: vmovaps %ymm6, 32(%r8)
-; AVX1-NEXT: vmovaps %ymm8, (%r8)
-; AVX1-NEXT: vmovaps %ymm12, 32(%r9)
-; AVX1-NEXT: vmovaps %ymm3, (%r9)
+; AVX1-NEXT: vmovaps %ymm8, 32(%rcx)
+; AVX1-NEXT: vmovaps %ymm6, (%rcx)
+; AVX1-NEXT: vmovaps %ymm4, 32(%r8)
+; AVX1-NEXT: vmovaps %ymm7, (%r8)
+; AVX1-NEXT: vmovaps %ymm2, 32(%r9)
+; AVX1-NEXT: vmovaps %ymm11, (%r9)
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX1-NEXT: vmovaps %ymm2, (%rax)
+; AVX1-NEXT: vmovaps %ymm9, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i64_stride6_vf8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps 352(%rdi), %ymm13
+; AVX2-NEXT: vmovaps 352(%rdi), %ymm0
; AVX2-NEXT: vmovaps 128(%rdi), %ymm4
; AVX2-NEXT: vmovaps 96(%rdi), %ymm9
; AVX2-NEXT: vmovaps 320(%rdi), %ymm2
; AVX2-NEXT: vmovaps 288(%rdi), %ymm7
; AVX2-NEXT: vmovaps 240(%rdi), %xmm6
-; AVX2-NEXT: vmovaps 192(%rdi), %xmm1
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm6[0]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm2[0],ymm7[2],ymm2[2]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 192(%rdi), %xmm8
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm6[0]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm2[0],ymm7[2],ymm2[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%rdi), %xmm5
-; AVX2-NEXT: vmovaps 16(%rdi), %xmm3
-; AVX2-NEXT: vmovaps 48(%rdi), %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm0[0]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm8
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm0[1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd 296(%rdi), %ymm0
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
-; AVX2-NEXT: vmovaps 64(%rdi), %xmm1
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm3[0],xmm1[0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
-; AVX2-NEXT: vmovaps 256(%rdi), %xmm5
-; AVX2-NEXT: vmovaps 208(%rdi), %xmm6
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm5[0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vmovaps 160(%rdi), %ymm0
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1]
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm0[1],ymm9[3],ymm0[3]
+; AVX2-NEXT: vmovaps 16(%rdi), %xmm11
+; AVX2-NEXT: vmovaps 48(%rdi), %xmm10
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm10[0]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm12
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm4[1],ymm12[3],ymm4[3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd 296(%rdi), %ymm10
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm2[1],ymm10[3],ymm2[3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm8
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-NEXT: vmovaps 64(%rdi), %xmm12
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm10
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
+; AVX2-NEXT: vmovaps 256(%rdi), %xmm13
+; AVX2-NEXT: vmovaps 208(%rdi), %xmm14
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vmovaps 160(%rdi), %ymm15
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm12
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm15[1],ymm9[3],ymm15[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm13[1],ymm7[3],ymm13[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT: vmovaps 80(%rdi), %xmm5
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm3[0],xmm5[0]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm0[1],ymm7[3],ymm0[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT: vmovaps 80(%rdi), %xmm11
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm15[0],ymm4[2],ymm15[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT: vmovaps 272(%rdi), %xmm6
-; AVX2-NEXT: vmovaps 224(%rdi), %xmm7
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[2],ymm13[2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vmovaps 272(%rdi), %xmm13
+; AVX2-NEXT: vmovaps 224(%rdi), %xmm14
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm11 = xmm7[0],xmm6[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm2
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm12[1],xmm11[1]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm11
+; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm11
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm3
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm13[1],ymm3[3],ymm13[3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm6[1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vmovaps %ymm12, (%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 32(%rsi)
-; AVX2-NEXT: vmovaps %ymm15, 32(%rdx)
-; AVX2-NEXT: vmovaps %ymm14, (%rdx)
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovaps %ymm3, (%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
+; AVX2-NEXT: vmovaps %ymm6, 32(%rdx)
+; AVX2-NEXT: vmovaps %ymm5, (%rdx)
; AVX2-NEXT: vmovaps %ymm10, 32(%rcx)
; AVX2-NEXT: vmovaps %ymm8, (%rcx)
-; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
+; AVX2-NEXT: vmovaps %ymm7, 32(%r8)
; AVX2-NEXT: vmovaps %ymm9, (%r8)
-; AVX2-NEXT: vmovaps %ymm2, 32(%r9)
+; AVX2-NEXT: vmovaps %ymm1, 32(%r9)
; AVX2-NEXT: vmovaps %ymm4, (%r9)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovaps %ymm3, 32(%rax)
-; AVX2-NEXT: vmovaps %ymm0, (%rax)
+; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
+; AVX2-NEXT: vmovaps %ymm2, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index a5fd21d2b57fb..65b4164c2b1ce 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -244,119 +244,119 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i8_stride3_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm8
-; SSE-NEXT: movdqa 16(%rdi), %xmm10
-; SSE-NEXT: movdqa 32(%rdi), %xmm7
+; SSE-NEXT: movdqa (%rdi), %xmm5
+; SSE-NEXT: movdqa 16(%rdi), %xmm4
+; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm10, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: pandn %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm5, %xmm2
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm9, %xmm9
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: pandn %xmm5, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pandn %xmm7, %xmm8
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: por %xmm8, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,5,4,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; SSE-NEXT: packuswb %xmm2, %xmm4
+; SSE-NEXT: packuswb %xmm2, %xmm10
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: pand %xmm2, %xmm10
+; SSE-NEXT: movdqa %xmm0, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm9, %xmm8
+; SSE-NEXT: pandn %xmm7, %xmm8
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE-NEXT: movdqa %xmm0, %xmm11
+; SSE-NEXT: pand %xmm9, %xmm11
+; SSE-NEXT: por %xmm8, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[3,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
+; SSE-NEXT: packuswb %xmm8, %xmm11
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: pandn %xmm11, %xmm8
+; SSE-NEXT: por %xmm10, %xmm8
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
+; SSE-NEXT: movdqa %xmm4, %xmm11
+; SSE-NEXT: pand %xmm10, %xmm11
+; SSE-NEXT: pandn %xmm5, %xmm10
+; SSE-NEXT: por %xmm11, %xmm10
+; SSE-NEXT: movdqa %xmm10, %xmm11
+; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
+; SSE-NEXT: movdqa %xmm9, %xmm12
+; SSE-NEXT: pandn %xmm11, %xmm12
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; SSE-NEXT: pand %xmm9, %xmm10
+; SSE-NEXT: por %xmm12, %xmm10
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm11, %xmm10
+; SSE-NEXT: pand %xmm2, %xmm10
; SSE-NEXT: movdqa %xmm7, %xmm11
-; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: pandn %xmm11, %xmm5
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: pand %xmm6, %xmm0
-; SSE-NEXT: por %xmm5, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm12
-; SSE-NEXT: pandn %xmm0, %xmm12
-; SSE-NEXT: por %xmm4, %xmm12
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm8, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: pand %xmm9, %xmm11
+; SSE-NEXT: pandn %xmm0, %xmm9
+; SSE-NEXT: por %xmm11, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
+; SSE-NEXT: packuswb %xmm9, %xmm9
+; SSE-NEXT: pandn %xmm9, %xmm2
+; SSE-NEXT: por %xmm10, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE-NEXT: pand %xmm5, %xmm3
; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
-; SSE-NEXT: pand %xmm6, %xmm0
-; SSE-NEXT: por %xmm5, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm4
-; SSE-NEXT: pand %xmm6, %xmm4
-; SSE-NEXT: pandn %xmm7, %xmm6
-; SSE-NEXT: por %xmm4, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
-; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: pandn %xmm4, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm10
-; SSE-NEXT: pandn %xmm8, %xmm3
-; SSE-NEXT: por %xmm10, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pandn %xmm0, %xmm4
-; SSE-NEXT: por %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,0]
+; SSE-NEXT: por %xmm3, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm7
-; SSE-NEXT: pandn %xmm11, %xmm1
-; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: por %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm12, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm4, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pandn %xmm7, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: por %xmm3, %xmm4
+; SSE-NEXT: movdqa %xmm8, (%rsi)
; SSE-NEXT: movdqa %xmm2, (%rdx)
-; SSE-NEXT: movdqa %xmm0, (%rcx)
+; SSE-NEXT: movdqa %xmm4, (%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i8_stride3_vf16:
@@ -419,241 +419,239 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
; SSE-LABEL: load_i8_stride3_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa 64(%rdi), %xmm15
-; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 64(%rdi), %xmm2
; SSE-NEXT: movdqa (%rdi), %xmm5
-; SSE-NEXT: movdqa 16(%rdi), %xmm13
-; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: movdqa 48(%rdi), %xmm9
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
-; SSE-NEXT: movdqa %xmm14, %xmm6
-; SSE-NEXT: pandn %xmm13, %xmm6
+; SSE-NEXT: movdqa 16(%rdi), %xmm14
+; SSE-NEXT: movdqa 32(%rdi), %xmm4
+; SSE-NEXT: movdqa 48(%rdi), %xmm13
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pandn %xmm14, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
-; SSE-NEXT: movdqa %xmm7, %xmm10
-; SSE-NEXT: pandn %xmm5, %xmm10
-; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm8
+; SSE-NEXT: pandn %xmm5, %xmm8
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pandn %xmm5, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: pand %xmm14, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: por %xmm6, %xmm0
-; SSE-NEXT: pxor %xmm8, %xmm8
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
+; SSE-NEXT: pxor %xmm10, %xmm10
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm5, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: packuswb %xmm0, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; SSE-NEXT: pand %xmm6, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm11
-; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm11, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm2, %xmm12
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: por %xmm1, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm14, %xmm1
-; SSE-NEXT: pandn %xmm15, %xmm1
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pand %xmm14, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
-; SSE-NEXT: packuswb %xmm3, %xmm1
-; SSE-NEXT: movdqa 80(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm0, %xmm15
-; SSE-NEXT: pandn %xmm9, %xmm15
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE-NEXT: packuswb %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; SSE-NEXT: pand %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm15, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm15, %xmm9
+; SSE-NEXT: pandn %xmm3, %xmm9
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
+; SSE-NEXT: movdqa %xmm4, %xmm11
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm15, %xmm11
+; SSE-NEXT: por %xmm9, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5]
+; SSE-NEXT: packuswb %xmm9, %xmm11
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: pandn %xmm11, %xmm9
+; SSE-NEXT: por %xmm0, %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm13, %xmm11
+; SSE-NEXT: pand %xmm1, %xmm11
+; SSE-NEXT: por %xmm0, %xmm11
+; SSE-NEXT: movdqa %xmm11, %xmm13
+; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15]
+; SSE-NEXT: movdqa %xmm5, %xmm12
+; SSE-NEXT: pandn %xmm13, %xmm12
+; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; SSE-NEXT: pand %xmm5, %xmm11
+; SSE-NEXT: por %xmm12, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,4,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm13[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3]
+; SSE-NEXT: packuswb %xmm11, %xmm0
+; SSE-NEXT: movdqa 80(%rdi), %xmm13
+; SSE-NEXT: movdqa %xmm13, %xmm9
+; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
+; SSE-NEXT: movdqa %xmm15, %xmm12
+; SSE-NEXT: pandn %xmm9, %xmm12
+; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
+; SSE-NEXT: movdqa %xmm13, %xmm11
+; SSE-NEXT: pand %xmm15, %xmm11
+; SSE-NEXT: por %xmm12, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5]
+; SSE-NEXT: packuswb %xmm11, %xmm11
+; SSE-NEXT: movdqa %xmm6, %xmm12
+; SSE-NEXT: pandn %xmm11, %xmm12
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: por %xmm0, %xmm12
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: por %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15]
+; SSE-NEXT: movdqa %xmm15, %xmm11
+; SSE-NEXT: pandn %xmm8, %xmm11
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: por %xmm11, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,5,6,7,4]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm0, %xmm11
+; SSE-NEXT: movdqa %xmm15, %xmm8
+; SSE-NEXT: pandn %xmm4, %xmm8
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm3, %xmm0
+; SSE-NEXT: pand %xmm15, %xmm3
+; SSE-NEXT: por %xmm8, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: movdqa %xmm6, %xmm15
-; SSE-NEXT: pandn %xmm3, %xmm15
-; SSE-NEXT: pand %xmm6, %xmm1
-; SSE-NEXT: por %xmm1, %xmm15
-; SSE-NEXT: movdqa %xmm13, %xmm1
-; SSE-NEXT: pand %xmm7, %xmm1
-; SSE-NEXT: por %xmm10, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm0, %xmm10
-; SSE-NEXT: pandn %xmm3, %xmm10
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por %xmm10, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm8
+; SSE-NEXT: pand %xmm6, %xmm11
+; SSE-NEXT: por %xmm11, %xmm8
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: pandn %xmm2, %xmm7
+; SSE-NEXT: por %xmm3, %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE-NEXT: movdqa %xmm15, %xmm11
+; SSE-NEXT: pandn %xmm3, %xmm11
+; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; SSE-NEXT: pand %xmm15, %xmm7
+; SSE-NEXT: por %xmm11, %xmm7
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,6,7,4]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm3, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm10
-; SSE-NEXT: pandn %xmm12, %xmm10
-; SSE-NEXT: movdqa %xmm5, %xmm3
-; SSE-NEXT: pandn %xmm11, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm11
-; SSE-NEXT: por %xmm10, %xmm11
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm6, %xmm10
-; SSE-NEXT: pandn %xmm2, %xmm10
-; SSE-NEXT: pand %xmm6, %xmm1
-; SSE-NEXT: por %xmm1, %xmm10
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm11, %xmm1
-; SSE-NEXT: pand %xmm7, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: pandn %xmm12, %xmm7
-; SSE-NEXT: por %xmm1, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; SSE-NEXT: pand %xmm0, %xmm7
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
-; SSE-NEXT: pand %xmm14, %xmm13
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
-; SSE-NEXT: pand %xmm1, %xmm13
-; SSE-NEXT: por %xmm2, %xmm13
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[3,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: por %xmm3, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
+; SSE-NEXT: packuswb %xmm3, %xmm7
+; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: pand %xmm15, %xmm3
+; SSE-NEXT: pandn %xmm13, %xmm15
+; SSE-NEXT: por %xmm3, %xmm15
+; SSE-NEXT: pand %xmm6, %xmm7
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: pandn %xmm3, %xmm7
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: pand %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm12, %xmm14
-; SSE-NEXT: por %xmm2, %xmm14
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
+; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: por %xmm7, %xmm6
; SSE-NEXT: pand %xmm1, %xmm14
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm14, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,0,65535,65535,0,65535]
+; SSE-NEXT: movdqa %xmm7, %xmm11
+; SSE-NEXT: pandn %xmm3, %xmm11
+; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7]
+; SSE-NEXT: pand %xmm7, %xmm14
+; SSE-NEXT: por %xmm11, %xmm14
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm3, %xmm4
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: pand %xmm5, %xmm11
+; SSE-NEXT: por %xmm0, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm11
+; SSE-NEXT: pandn %xmm0, %xmm11
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: por %xmm4, %xmm11
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: por %xmm14, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm7
+; SSE-NEXT: por %xmm1, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm13
; SSE-NEXT: pandn %xmm9, %xmm5
-; SSE-NEXT: por %xmm4, %xmm5
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm15, 16(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, (%rsi)
+; SSE-NEXT: por %xmm13, %xmm5
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm12, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, (%rsi)
; SSE-NEXT: movdqa %xmm6, 16(%rdx)
-; SSE-NEXT: movdqa %xmm10, (%rdx)
-; SSE-NEXT: movdqa %xmm0, 16(%rcx)
-; SSE-NEXT: movdqa %xmm7, (%rcx)
+; SSE-NEXT: movdqa %xmm8, (%rdx)
+; SSE-NEXT: movdqa %xmm3, 16(%rcx)
+; SSE-NEXT: movdqa %xmm11, (%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_stride3_vf32:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index a7a237edad828..e94af68d3526d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -163,41 +163,41 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm8, %xmm8
-; SSE-NEXT: movdqa %xmm4, %xmm10
-; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
+; SSE-NEXT: pxor %xmm7, %xmm7
+; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
-; SSE-NEXT: packuswb %xmm7, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; SSE-NEXT: packuswb %xmm8, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE-NEXT: packuswb %xmm4, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
@@ -282,114 +282,114 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: load_i8_stride4_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm14
-; SSE-NEXT: movdqa 16(%rdi), %xmm12
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa 32(%rdi), %xmm6
-; SSE-NEXT: movdqa 48(%rdi), %xmm7
+; SSE-NEXT: movdqa 48(%rdi), %xmm11
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0]
-; SSE-NEXT: movdqa %xmm7, %xmm3
+; SSE-NEXT: movdqa %xmm11, %xmm3
; SSE-NEXT: pand %xmm0, %xmm3
; SSE-NEXT: movdqa %xmm6, %xmm4
; SSE-NEXT: pand %xmm0, %xmm4
; SSE-NEXT: packuswb %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: pand %xmm14, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm3, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: movdqa %xmm7, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm7, %xmm9
-; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm6, %xmm10
-; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm6, %xmm11
-; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: packuswb %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm12, %xmm8
-; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm12, %xmm13
-; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm14, %xmm15
-; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
-; SSE-NEXT: movdqa %xmm14, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm9, %xmm9
+; SSE-NEXT: movdqa %xmm11, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm1[0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm11, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
+; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
+; SSE-NEXT: packuswb %xmm8, %xmm14
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm2, %xmm10
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
+; SSE-NEXT: movdqa %xmm1, %xmm13
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm12
+; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
+; SSE-NEXT: packuswb %xmm15, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm14[0,3]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm7
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm1, %xmm11
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
; SSE-NEXT: pand %xmm1, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm2, %xmm5
-; SSE-NEXT: pand %xmm1, %xmm12
-; SSE-NEXT: pand %xmm1, %xmm14
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm11, %xmm6
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: pand %xmm1, %xmm11
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[3,1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; SSE-NEXT: packuswb %xmm5, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: packuswb %xmm3, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: packuswb %xmm5, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3]
+; SSE-NEXT: packuswb %xmm3, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
; SSE-NEXT: movdqa %xmm0, (%rsi)
-; SSE-NEXT: movaps %xmm4, (%rdx)
+; SSE-NEXT: movaps %xmm9, (%rdx)
; SSE-NEXT: movaps %xmm1, (%rcx)
-; SSE-NEXT: movaps %xmm3, (%r8)
+; SSE-NEXT: movaps %xmm5, (%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_stride4_vf16:
@@ -406,7 +406,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
@@ -421,22 +421,22 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm8, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm2, (%rsi)
; AVX1-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX1-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm6, (%rcx)
; AVX1-NEXT: vmovdqa %xmm0, (%r8)
; AVX1-NEXT: retq
;
@@ -454,7 +454,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
@@ -469,28 +469,28 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm4
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-NEXT: vmovdqa %xmm8, (%rsi)
+; AVX2-NEXT: vmovdqa %xmm4, (%rsi)
; AVX2-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX2-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX2-NEXT: vmovdqa %xmm6, (%rcx)
; AVX2-NEXT: vmovdqa %xmm0, (%r8)
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_i8_stride4_vf16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqu64 (%rdi), %zmm8
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -509,22 +509,22 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm6
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm0
+; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8
; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm4
+; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX512-NEXT: vpmovdb %zmm8, (%rsi)
+; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512-NEXT: vmovdqa %xmm6, (%rcx)
; AVX512-NEXT: vmovdqa %xmm1, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -547,129 +547,130 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-LABEL: load_i8_stride4_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $120, %rsp
-; SSE-NEXT: movdqa 64(%rdi), %xmm11
-; SSE-NEXT: movdqa 80(%rdi), %xmm10
+; SSE-NEXT: movdqa 64(%rdi), %xmm4
+; SSE-NEXT: movdqa 80(%rdi), %xmm13
; SSE-NEXT: movdqa 96(%rdi), %xmm15
-; SSE-NEXT: movdqa 112(%rdi), %xmm13
-; SSE-NEXT: movdqa (%rdi), %xmm14
-; SSE-NEXT: movdqa 16(%rdi), %xmm9
-; SSE-NEXT: movdqa 32(%rdi), %xmm7
+; SSE-NEXT: movdqa 112(%rdi), %xmm9
+; SSE-NEXT: movdqa (%rdi), %xmm10
+; SSE-NEXT: movdqa 16(%rdi), %xmm14
+; SSE-NEXT: movdqa 32(%rdi), %xmm8
; SSE-NEXT: movdqa 48(%rdi), %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0]
; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm7, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: pand %xmm6, %xmm1
; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm10, %xmm2
+; SSE-NEXT: pand %xmm6, %xmm2
+; SSE-NEXT: packuswb %xmm0, %xmm2
+; SSE-NEXT: packuswb %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm14, %xmm4
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: packuswb %xmm0, %xmm4
-; SSE-NEXT: packuswb %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm6, %xmm1
; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm11, %xmm2
-; SSE-NEXT: packuswb %xmm0, %xmm2
-; SSE-NEXT: packuswb %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm13, %xmm0
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm0, %xmm6
+; SSE-NEXT: packuswb %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm5, %xmm6
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm5, %xmm7
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm11, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[0,3]
; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE-NEXT: movdqa %xmm14, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm14, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
-; SSE-NEXT: packuswb %xmm0, %xmm12
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm6[0,3]
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm13, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[1,3,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE-NEXT: packuswb %xmm6, %xmm0
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
-; SSE-NEXT: movdqa %xmm11, %xmm5
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; SSE-NEXT: movdqa %xmm11, %xmm8
-; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
+; SSE-NEXT: packuswb %xmm7, %xmm0
+; SSE-NEXT: movdqa %xmm13, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm13, %xmm12
+; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm2, %xmm11
+; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: packuswb %xmm1, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm3
@@ -677,212 +678,212 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pand %xmm0, %xmm7
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm9
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm1, %xmm2
; SSE-NEXT: pand %xmm0, %xmm14
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm0, %xmm10
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm6, %xmm9
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm3[0,3]
-; SSE-NEXT: pand %xmm0, %xmm13
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3]
+; SSE-NEXT: pand %xmm0, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE-NEXT: pand %xmm0, %xmm15
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm3, %xmm6
-; SSE-NEXT: pand %xmm0, %xmm10
-; SSE-NEXT: pand %xmm0, %xmm11
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm8
+; SSE-NEXT: pand %xmm0, %xmm13
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[1,0,3,2,4,5,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[0,3]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm6, %xmm7
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm2, %xmm8
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm6, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = mem[3,1,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm8[0,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; SSE-NEXT: # xmm9 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
+; SSE-NEXT: packuswb %xmm8, %xmm9
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm6, %xmm7
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm6, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[0,3]
-; SSE-NEXT: movdqa %xmm2, 16(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, (%rsi)
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; SSE-NEXT: packuswb %xmm8, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm9[0,3]
+; SSE-NEXT: movdqa %xmm6, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, (%rsi)
; SSE-NEXT: movaps %xmm4, 16(%rdx)
-; SSE-NEXT: movaps %xmm12, (%rdx)
+; SSE-NEXT: movaps %xmm5, (%rdx)
; SSE-NEXT: movaps %xmm3, 16(%rcx)
-; SSE-NEXT: movaps %xmm9, (%rcx)
-; SSE-NEXT: movaps %xmm5, 16(%r8)
+; SSE-NEXT: movaps %xmm1, (%rcx)
+; SSE-NEXT: movaps %xmm7, 16(%r8)
; SSE-NEXT: movaps %xmm0, (%r8)
; SSE-NEXT: addq $120, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_stride4_vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm1
-; AVX1-NEXT: vmovdqa 96(%rdi), %xmm11
-; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa 96(%rdi), %xmm2
+; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm13
-; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm4
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6
+; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8
-; AVX1-NEXT: vmovdqa (%rdi), %xmm12
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm14
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm4
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm3
-; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm4
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm3
-; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm1, %xmm10, %xmm2
-; AVX1-NEXT: vpshufb %xmm1, %xmm11, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm13, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm4
-; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm11
+; AVX1-NEXT: vpshufb %xmm8, %xmm6, %xmm8
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
+; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm11
+; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm11
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm12
+; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm13
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm12
+; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm9
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
+; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm12
+; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm11
+; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm13
+; AVX1-NEXT: vpshufb %xmm12, %xmm5, %xmm14
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm13
+; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm10
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
+; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm13
+; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm12
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm3
+; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm5
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX1-NEXT: vpshufb %xmm3, %xmm14, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm12, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovaps %ymm8, (%rsi)
; AVX1-NEXT: vmovaps %ymm9, (%rdx)
-; AVX1-NEXT: vmovaps %ymm0, (%rcx)
-; AVX1-NEXT: vmovaps %ymm1, (%r8)
+; AVX1-NEXT: vmovaps %ymm10, (%rcx)
+; AVX1-NEXT: vmovaps %ymm0, (%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i8_stride4_vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm9
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
@@ -892,70 +893,70 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm0
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm7, %ymm11, %ymm8
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184]
+; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
+; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm8
; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8
-; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm7
-; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm7
-; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm1
-; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm1, %ymm11, %ymm7
-; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-NEXT: vpshufb %ymm1, %ymm9, %ymm1
-; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm7
-; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm1, %ymm11, %ymm7
-; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-NEXT: vpshufb %ymm1, %ymm9, %ymm1
-; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm1, %xmm4, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
+; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
+; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
+; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
+; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
+; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm10
+; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm9
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11
+; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm11
+; AVX2-NEXT: vpermd %ymm11, %ymm6, %ymm11
+; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10
+; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm3
-; AVX2-NEXT: vpermd %ymm3, %ymm6, %ymm3
-; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm2
-; AVX2-NEXT: vpermd %ymm2, %ymm6, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vmovdqa %ymm10, (%rsi)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqa %ymm7, (%rsi)
; AVX2-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm1, (%r8)
+; AVX2-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index 04a5d17e3dbdb..8f9b387227bcf 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -141,60 +141,60 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,3,2,1,4,5,6,7]
-; SSE-NEXT: packuswb %xmm9, %xmm9
-; SSE-NEXT: pxor %xmm8, %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE-NEXT: pandn %xmm0, %xmm5
; SSE-NEXT: movdqa %xmm0, %xmm7
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: pandn %xmm3, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE-NEXT: movdqa %xmm3, %xmm9
+; SSE-NEXT: psrld $16, %xmm9
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3]
; SSE-NEXT: packuswb %xmm6, %xmm6
; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: movaps %xmm7, %xmm1
-; SSE-NEXT: andps %xmm2, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,3]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7]
+; SSE-NEXT: movaps %xmm7, %xmm6
+; SSE-NEXT: andps %xmm2, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7]
; SSE-NEXT: packuswb %xmm6, %xmm6
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
+; SSE-NEXT: packuswb %xmm7, %xmm7
+; SSE-NEXT: por %xmm0, %xmm8
+; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: movd %xmm9, (%rsi)
+; SSE-NEXT: movd %xmm1, (%rsi)
; SSE-NEXT: movd %xmm5, (%rdx)
-; SSE-NEXT: movd %xmm1, (%rcx)
-; SSE-NEXT: movd %xmm6, (%r8)
+; SSE-NEXT: movd %xmm6, (%rcx)
+; SSE-NEXT: movd %xmm7, (%r8)
; SSE-NEXT: movd %xmm0, (%r9)
; SSE-NEXT: movd %xmm2, (%rax)
; SSE-NEXT: retq
@@ -314,148 +314,148 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-LABEL: load_i8_stride6_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa (%rdi), %xmm8
-; SSE-NEXT: movdqa 16(%rdi), %xmm11
+; SSE-NEXT: movdqa (%rdi), %xmm4
+; SSE-NEXT: movdqa 16(%rdi), %xmm3
; SSE-NEXT: movdqa 32(%rdi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pandn %xmm3, %xmm8
+; SSE-NEXT: por %xmm1, %xmm8
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm8, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm11, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pand %xmm13, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,7,6,7]
-; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm14, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm12
-; SSE-NEXT: pand %xmm13, %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: movdqa %xmm14, %xmm10
-; SSE-NEXT: pandn %xmm4, %xmm10
-; SSE-NEXT: por %xmm3, %xmm10
-; SSE-NEXT: pxor %xmm9, %xmm9
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,65535]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7]
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm3, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7]
; SSE-NEXT: packuswb %xmm6, %xmm6
-; SSE-NEXT: pand %xmm14, %xmm6
-; SSE-NEXT: movdqa %xmm0, %xmm15
-; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: pandn %xmm2, %xmm14
-; SSE-NEXT: por %xmm6, %xmm14
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: pandn %xmm11, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm7
-; SSE-NEXT: pand %xmm3, %xmm7
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm2
-; SSE-NEXT: pand %xmm13, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,5,5,5]
-; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm6, %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,5,6]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm7, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm7
; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pandn %xmm1, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: pand %xmm6, %xmm5
-; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm6, %xmm7
-; SSE-NEXT: pandn %xmm1, %xmm7
-; SSE-NEXT: por %xmm5, %xmm7
-; SSE-NEXT: pand %xmm3, %xmm11
-; SSE-NEXT: pandn %xmm8, %xmm3
-; SSE-NEXT: por %xmm11, %xmm3
-; SSE-NEXT: pand %xmm3, %xmm13
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm6, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5]
+; SSE-NEXT: packuswb %xmm9, %xmm9
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm9, %xmm2
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm6, %xmm6
+; SSE-NEXT: movdqa %xmm8, %xmm9
+; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7]
+; SSE-NEXT: pand %xmm10, %xmm8
+; SSE-NEXT: pandn %xmm9, %xmm10
+; SSE-NEXT: por %xmm8, %xmm10
+; SSE-NEXT: packuswb %xmm10, %xmm10
+; SSE-NEXT: pand %xmm1, %xmm10
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; SSE-NEXT: packuswb %xmm9, %xmm9
+; SSE-NEXT: pandn %xmm9, %xmm1
+; SSE-NEXT: por %xmm10, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm11, %xmm9
+; SSE-NEXT: pandn %xmm3, %xmm9
+; SSE-NEXT: movdqa %xmm4, %xmm12
+; SSE-NEXT: pand %xmm11, %xmm12
+; SSE-NEXT: por %xmm9, %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm9
+; SSE-NEXT: pand %xmm5, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5]
+; SSE-NEXT: packuswb %xmm13, %xmm13
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm9, %xmm13
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6]
+; SSE-NEXT: packuswb %xmm14, %xmm14
+; SSE-NEXT: movdqa %xmm9, %xmm10
+; SSE-NEXT: pandn %xmm14, %xmm10
+; SSE-NEXT: por %xmm13, %xmm10
+; SSE-NEXT: movdqa %xmm12, %xmm13
+; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
+; SSE-NEXT: pand %xmm14, %xmm12
+; SSE-NEXT: pandn %xmm13, %xmm14
+; SSE-NEXT: por %xmm12, %xmm14
+; SSE-NEXT: packuswb %xmm14, %xmm14
+; SSE-NEXT: pand %xmm9, %xmm14
+; SSE-NEXT: movdqa %xmm8, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0]
+; SSE-NEXT: movaps %xmm0, %xmm13
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2]
+; SSE-NEXT: packuswb %xmm13, %xmm13
+; SSE-NEXT: movdqa %xmm9, %xmm12
+; SSE-NEXT: pandn %xmm13, %xmm12
+; SSE-NEXT: por %xmm14, %xmm12
+; SSE-NEXT: pand %xmm11, %xmm3
+; SSE-NEXT: pandn %xmm4, %xmm11
+; SSE-NEXT: por %xmm3, %xmm11
+; SSE-NEXT: pand %xmm11, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: pand %xmm9, %xmm4
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2]
+; SSE-NEXT: packuswb %xmm5, %xmm5
+; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm11, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7]
+; SSE-NEXT: pand %xmm5, %xmm6
; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: por %xmm1, %xmm5
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,65535]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm4
-; SSE-NEXT: por %xmm3, %xmm4
-; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: pand %xmm6, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3]
+; SSE-NEXT: por %xmm6, %xmm5
+; SSE-NEXT: packuswb %xmm5, %xmm5
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: por %xmm4, %xmm6
-; SSE-NEXT: movq %xmm10, (%rsi)
-; SSE-NEXT: movq %xmm14, (%rdx)
-; SSE-NEXT: movq %xmm2, (%rcx)
-; SSE-NEXT: movq %xmm7, (%r8)
-; SSE-NEXT: movq %xmm5, (%r9)
-; SSE-NEXT: movq %xmm6, (%rax)
+; SSE-NEXT: pandn %xmm0, %xmm9
+; SSE-NEXT: por %xmm5, %xmm9
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: movq %xmm1, (%rdx)
+; SSE-NEXT: movq %xmm10, (%rcx)
+; SSE-NEXT: movq %xmm12, (%r8)
+; SSE-NEXT: movq %xmm3, (%r9)
+; SSE-NEXT: movq %xmm9, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_stride6_vf8:
@@ -470,13 +470,13 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm8
+; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm9
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
@@ -485,28 +485,28 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpshufb %xmm6, %xmm8, %xmm8
+; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm8, (%rsi)
-; AVX1-NEXT: vmovq %xmm9, (%rdx)
+; AVX1-NEXT: vmovq %xmm3, (%rsi)
+; AVX1-NEXT: vmovq %xmm4, (%rdx)
; AVX1-NEXT: vmovq %xmm5, (%rcx)
-; AVX1-NEXT: vmovq %xmm3, (%r8)
-; AVX1-NEXT: vmovq %xmm4, (%r9)
+; AVX1-NEXT: vmovq %xmm7, (%r8)
+; AVX1-NEXT: vmovq %xmm8, (%r9)
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: retq
;
@@ -607,88 +607,86 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i8_stride6_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa 64(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdi), %xmm8
-; SSE-NEXT: movdqa 16(%rdi), %xmm5
-; SSE-NEXT: movdqa 32(%rdi), %xmm12
+; SSE-NEXT: movdqa 64(%rdi), %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm5
+; SSE-NEXT: movdqa 16(%rdi), %xmm11
+; SSE-NEXT: movdqa 32(%rdi), %xmm10
; SSE-NEXT: movdqa 48(%rdi), %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm10, %xmm7
-; SSE-NEXT: pandn %xmm12, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: pandn %xmm10, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm8, %xmm0
; SSE-NEXT: pandn %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: pandn %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm10, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm6
; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: pandn %xmm0, %xmm9
-; SSE-NEXT: movdqa %xmm14, %xmm4
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: pandn %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pand %xmm10, %xmm5
-; SSE-NEXT: movdqa %xmm10, %xmm11
-; SSE-NEXT: pandn %xmm0, %xmm11
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: pandn %xmm11, %xmm9
+; SSE-NEXT: movdqa %xmm1, %xmm11
+; SSE-NEXT: pand %xmm3, %xmm11
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm5, %xmm14
+; SSE-NEXT: pand %xmm3, %xmm14
+; SSE-NEXT: movdqa 80(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm13
-; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: pand %xmm10, %xmm2
-; SSE-NEXT: movdqa 80(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm3, %xmm13
+; SSE-NEXT: movdqa %xmm10, %xmm15
+; SSE-NEXT: pand %xmm3, %xmm10
+; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm10, %xmm7
-; SSE-NEXT: movdqa %xmm12, %xmm15
-; SSE-NEXT: pand %xmm10, %xmm12
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm10, %xmm13
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm13
-; SSE-NEXT: movdqa %xmm10, %xmm12
-; SSE-NEXT: pandn %xmm8, %xmm10
-; SSE-NEXT: pand %xmm14, %xmm8
-; SSE-NEXT: por %xmm4, %xmm8
-; SSE-NEXT: movdqa %xmm8, %xmm4
-; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,7,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: por %xmm9, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm13
-; SSE-NEXT: por %xmm13, %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
-; SSE-NEXT: packuswb %xmm4, %xmm13
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm12
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: por %xmm9, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pand %xmm7, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
+; SSE-NEXT: packuswb %xmm9, %xmm9
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
+; SSE-NEXT: por %xmm4, %xmm9
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: por %xmm1, %xmm11
+; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pandn %xmm13, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm9
+; SSE-NEXT: por %xmm9, %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pxor %xmm13, %xmm13
+; SSE-NEXT: pxor %xmm9, %xmm9
; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrld $16, %xmm0
@@ -696,28 +694,29 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT: packuswb %xmm6, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[1,3,2,0,4,5,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,65535]
-; SSE-NEXT: pand %xmm6, %xmm8
+; SSE-NEXT: pand %xmm6, %xmm5
; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: por %xmm8, %xmm6
+; SSE-NEXT: por %xmm5, %xmm6
; SSE-NEXT: packuswb %xmm6, %xmm6
-; SSE-NEXT: pand %xmm14, %xmm6
-; SSE-NEXT: pandn %xmm1, %xmm14
-; SSE-NEXT: por %xmm14, %xmm6
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: pandn %xmm1, %xmm8
+; SSE-NEXT: por %xmm8, %xmm6
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535]
@@ -725,14 +724,14 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pandn %xmm0, %xmm5
; SSE-NEXT: por %xmm1, %xmm5
; SSE-NEXT: packuswb %xmm5, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm8
-; SSE-NEXT: pandn %xmm0, %xmm8
+; SSE-NEXT: movdqa %xmm4, %xmm11
+; SSE-NEXT: pandn %xmm0, %xmm11
; SSE-NEXT: pand %xmm4, %xmm6
-; SSE-NEXT: por %xmm6, %xmm8
+; SSE-NEXT: por %xmm6, %xmm11
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
@@ -740,85 +739,83 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: por %xmm11, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: por %xmm2, %xmm14
+; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: pandn %xmm1, %xmm6
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: por %xmm0, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: pandn %xmm9, %xmm12
-; SSE-NEXT: por %xmm12, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pandn %xmm1, %xmm5
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: por %xmm0, %xmm5
+; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: por %xmm12, %xmm13
+; SSE-NEXT: movdqa %xmm13, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm11
-; SSE-NEXT: pandn %xmm0, %xmm11
-; SSE-NEXT: pand %xmm4, %xmm6
-; SSE-NEXT: por %xmm6, %xmm11
+; SSE-NEXT: movdqa %xmm4, %xmm8
+; SSE-NEXT: pandn %xmm0, %xmm8
+; SSE-NEXT: pand %xmm4, %xmm5
+; SSE-NEXT: por %xmm5, %xmm8
; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
-; SSE-NEXT: movdqa %xmm15, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,0]
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,2]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15]
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE-NEXT: movaps %xmm0, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm6, %xmm2
+; SSE-NEXT: pand %xmm6, %xmm5
; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
-; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: por %xmm5, %xmm6
+; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: packuswb %xmm6, %xmm6
-; SSE-NEXT: pandn %xmm6, %xmm5
-; SSE-NEXT: por %xmm1, %xmm5
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
+; SSE-NEXT: pandn %xmm6, %xmm2
+; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm13, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,0,3]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm5
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0]
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm5, %xmm0
; SSE-NEXT: pandn %xmm0, %xmm4
-; SSE-NEXT: por %xmm5, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm2, %xmm4
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
@@ -827,9 +824,9 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
@@ -838,61 +835,62 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pand %xmm2, %xmm5
; SSE-NEXT: por %xmm1, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: pand %xmm12, %xmm9
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: pand %xmm12, %xmm1
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: por %xmm9, %xmm12
-; SSE-NEXT: pand %xmm12, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
+; SSE-NEXT: por %xmm1, %xmm12
+; SSE-NEXT: pand %xmm12, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm1[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[2,3]
+; SSE-NEXT: movdqa %xmm0, %xmm7
+; SSE-NEXT: pandn %xmm1, %xmm7
+; SSE-NEXT: por %xmm5, %xmm7
+; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,3]
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
; SSE-NEXT: packuswb %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm10, %xmm5
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,1,2,4,5,6,7]
-; SSE-NEXT: pand %xmm6, %xmm7
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7]
+; SSE-NEXT: pand %xmm6, %xmm3
; SSE-NEXT: pandn %xmm5, %xmm6
-; SSE-NEXT: por %xmm7, %xmm6
+; SSE-NEXT: por %xmm3, %xmm6
; SSE-NEXT: packuswb %xmm6, %xmm6
; SSE-NEXT: pand %xmm2, %xmm6
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: movdqa %xmm12, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,0,0]
; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4]
-; SSE-NEXT: pandn %xmm5, %xmm2
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,4]
+; SSE-NEXT: pandn %xmm3, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: pand %xmm0, %xmm6
; SSE-NEXT: packuswb %xmm2, %xmm1
@@ -900,10 +898,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rsi)
-; SSE-NEXT: movdqa %xmm8, (%rdx)
-; SSE-NEXT: movdqa %xmm11, (%rcx)
+; SSE-NEXT: movdqa %xmm11, (%rdx)
+; SSE-NEXT: movdqa %xmm8, (%rcx)
; SSE-NEXT: movdqa %xmm4, (%r8)
-; SSE-NEXT: movdqa %xmm3, (%r9)
+; SSE-NEXT: movdqa %xmm7, (%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movdqa %xmm0, (%rax)
; SSE-NEXT: retq
@@ -921,86 +919,86 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[4,10]
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX1-NEXT: vpblendvb %xmm10, %xmm8, %xmm6, %xmm8
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3,4,5],xmm6[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm6, %xmm9
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm7[0],xmm6[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u>
-; AVX1-NEXT: vpblendvb %xmm12, %xmm11, %xmm6, %xmm11
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12]
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm10, %xmm11, %xmm6, %xmm11
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm6[0],xmm7[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm12, %xmm13, %xmm6, %xmm12
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,7,13]
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendvb %xmm10, %xmm12, %xmm6, %xmm10
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm12
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u>
-; AVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm6, %xmm12
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3,4],xmm6[5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
+; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT: vpblendvb %xmm9, %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
+; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX1-NEXT: vpblendvb %xmm9, %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm10, %xmm11, %xmm10
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u>
+; AVX1-NEXT: vpblendvb %xmm11, %xmm8, %xmm10, %xmm8
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12]
+; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm10
+; AVX1-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm8
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm12, %xmm13, %xmm12
+; AVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm12, %xmm10
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,7,13]
+; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm11
+; AVX1-NEXT: vpblendvb %xmm9, %xmm10, %xmm11, %xmm9
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm10, %xmm11, %xmm10
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u>
+; AVX1-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
+; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX1-NEXT: vpblendvb %xmm13, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendvb %xmm12, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,9,15]
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
-; AVX1-NEXT: vmovdqa %xmm8, (%rsi)
-; AVX1-NEXT: vmovdqa %xmm9, (%rdx)
-; AVX1-NEXT: vmovdqa %xmm11, (%rcx)
-; AVX1-NEXT: vmovdqa %xmm10, (%r8)
-; AVX1-NEXT: vmovdqa %xmm6, (%r9)
+; AVX1-NEXT: vmovdqa %xmm6, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm7, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm9, (%r8)
+; AVX1-NEXT: vmovdqa %xmm10, (%r9)
; AVX1-NEXT: vmovdqa %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i8_stride6_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovdqa (%rdi), %ymm8
+; AVX2-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm4, %ymm5
+; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
@@ -1008,247 +1006,247 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10]
; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
-; AVX2-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %xmm11, %xmm2, %xmm3, %xmm9
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
-; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX2-NEXT: vpblendvb %xmm11, %xmm3, %xmm5, %xmm10
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm8, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12]
-; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5
-; AVX2-NEXT: vpblendvb %xmm11, %xmm2, %xmm5, %xmm12
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
+; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX2-NEXT: vpblendvb %xmm11, %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
-; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm5[0,6,12],zero,zero,zero,xmm5[4,10,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,10],zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
-; AVX2-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,7,13],zero,zero,zero,xmm5[5,11,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,11],zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
+; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm9, %xmm10, %xmm9
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12]
+; AVX2-NEXT: vpor %xmm10, %xmm11, %xmm10
+; AVX2-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13]
+; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7
+; AVX2-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
+; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
+; AVX2-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-NEXT: vmovdqa %xmm9, (%rsi)
-; AVX2-NEXT: vmovdqa %xmm10, (%rdx)
-; AVX2-NEXT: vmovdqa %xmm12, (%rcx)
-; AVX2-NEXT: vmovdqa %xmm3, (%r8)
-; AVX2-NEXT: vmovdqa %xmm2, (%r9)
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
+; AVX2-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX2-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX2-NEXT: vmovdqa %xmm9, (%rcx)
+; AVX2-NEXT: vmovdqa %xmm6, (%r8)
+; AVX2-NEXT: vmovdqa %xmm7, (%r9)
; AVX2-NEXT: vmovdqa %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_i8_stride6_vf16:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa (%rdi), %xmm5
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512-NEXT: vpextrb $2, %xmm2, %r10d
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[0,6,12],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm4
+; AVX512-NEXT: vpinsrb $3, %r10d, %xmm3, %xmm3
+; AVX512-NEXT: vpextrb $8, %xmm2, %r10d
+; AVX512-NEXT: vpinsrb $4, %r10d, %xmm3, %xmm3
+; AVX512-NEXT: vpextrb $14, %xmm2, %r10d
+; AVX512-NEXT: vpinsrb $5, %r10d, %xmm3, %xmm3
+; AVX512-NEXT: vpextrb $4, %xmm1, %r10d
+; AVX512-NEXT: vpinsrb $6, %r10d, %xmm3, %xmm3
+; AVX512-NEXT: vpextrb $10, %xmm1, %r10d
+; AVX512-NEXT: vpinsrb $7, %r10d, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %r10d
+; AVX512-NEXT: vpinsrb $8, %r10d, %xmm3, %xmm3
+; AVX512-NEXT: vpextrb $6, %xmm0, %r10d
+; AVX512-NEXT: vpinsrb $9, %r10d, %xmm3, %xmm3
+; AVX512-NEXT: vpextrb $12, %xmm0, %r10d
+; AVX512-NEXT: vpinsrb $10, %r10d, %xmm3, %xmm4
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm4, %xmm6
+; AVX512-NEXT: vpextrb $2, %xmm3, %r10d
+; AVX512-NEXT: vpinsrb $11, %r10d, %xmm4, %xmm4
+; AVX512-NEXT: vpextrb $8, %xmm3, %r10d
+; AVX512-NEXT: vpinsrb $12, %r10d, %xmm4, %xmm4
+; AVX512-NEXT: vpextrb $14, %xmm3, %r10d
+; AVX512-NEXT: vpinsrb $13, %r10d, %xmm4, %xmm6
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512-NEXT: vpextrb $4, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $10, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm6, %xmm8
-; AVX512-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,7,13],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $5, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $11, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm6, %xmm9
-; AVX512-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[2,8,14],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vmovd %xmm4, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $6, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $12, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm6, %xmm10
-; AVX512-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,9,15],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $7, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $13, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm7, %xmm7
-; AVX512-NEXT: vpextrb $10, %xmm5, %eax
-; AVX512-NEXT: vpextrb $4, %xmm5, %edi
-; AVX512-NEXT: vmovd %edi, %xmm6
-; AVX512-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vmovd %xmm3, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $2, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $8, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $14, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6
-; AVX512-NEXT: vpextrb $11, %xmm5, %eax
-; AVX512-NEXT: vpextrb $5, %xmm5, %edi
-; AVX512-NEXT: vmovd %edi, %xmm5
-; AVX512-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm5, %xmm2
-; AVX512-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm1
-; AVX512-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm0
-; AVX512-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $3, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $9, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $15, %xmm4, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm8, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm9, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm10, (%rcx)
-; AVX512-NEXT: vmovdqa %xmm7, (%r8)
-; AVX512-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512-NEXT: vmovdqa %xmm0, (%r10)
+; AVX512-NEXT: vpextrb $4, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $14, %edi, %xmm6, %xmm6
+; AVX512-NEXT: vpextrb $10, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $15, %edi, %xmm6, %xmm6
+; AVX512-NEXT: vpextrb $3, %xmm2, %edi
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[1,7,13],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpinsrb $3, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $9, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $4, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $15, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $5, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $5, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $6, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $11, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $7, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $1, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $8, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $7, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $9, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $13, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $10, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $3, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $11, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $9, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $12, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $15, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $13, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $5, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $14, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $11, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $15, %edi, %xmm7, %xmm7
+; AVX512-NEXT: vpextrb $4, %xmm2, %edi
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[2,8,14],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpinsrb $3, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $10, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $4, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vmovd %xmm1, %edi
+; AVX512-NEXT: vpinsrb $5, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $6, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $6, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $12, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $7, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $2, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $8, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $8, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $9, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $14, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $10, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $4, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $11, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $10, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $12, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vmovd %xmm4, %edi
+; AVX512-NEXT: vpinsrb $13, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $6, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $14, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $12, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $15, %edi, %xmm8, %xmm8
+; AVX512-NEXT: vpextrb $5, %xmm2, %edi
+; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[3,9,15],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpinsrb $3, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $11, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $4, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $1, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $5, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $7, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $6, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $13, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $7, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $3, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $8, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $9, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $9, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $15, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $10, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $5, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $11, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $11, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $12, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $1, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $13, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $7, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $14, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $13, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $15, %edi, %xmm9, %xmm9
+; AVX512-NEXT: vpextrb $10, %xmm5, %edi
+; AVX512-NEXT: vpextrb $4, %xmm5, %r10d
+; AVX512-NEXT: vmovd %r10d, %xmm10
+; AVX512-NEXT: vpinsrb $1, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vmovd %xmm2, %edi
+; AVX512-NEXT: vpinsrb $2, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $6, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $3, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $12, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $4, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $2, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $5, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $8, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $6, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $14, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $7, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $4, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $8, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $10, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $9, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vmovd %xmm3, %edi
+; AVX512-NEXT: vpinsrb $10, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $6, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $11, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $12, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $12, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $2, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $13, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $8, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $14, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $14, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $15, %edi, %xmm10, %xmm10
+; AVX512-NEXT: vpextrb $11, %xmm5, %edi
+; AVX512-NEXT: vpextrb $5, %xmm5, %r10d
+; AVX512-NEXT: vmovd %r10d, %xmm5
+; AVX512-NEXT: vpinsrb $1, %edi, %xmm5, %xmm5
+; AVX512-NEXT: vpextrb $1, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $2, %edi, %xmm5, %xmm5
+; AVX512-NEXT: vpextrb $7, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $3, %edi, %xmm5, %xmm5
+; AVX512-NEXT: vpextrb $13, %xmm2, %edi
+; AVX512-NEXT: vpinsrb $4, %edi, %xmm5, %xmm2
+; AVX512-NEXT: vpextrb $3, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $5, %edi, %xmm2, %xmm2
+; AVX512-NEXT: vpextrb $9, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2
+; AVX512-NEXT: vpextrb $15, %xmm1, %edi
+; AVX512-NEXT: vpinsrb $7, %edi, %xmm2, %xmm1
+; AVX512-NEXT: vpextrb $5, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $8, %edi, %xmm1, %xmm1
+; AVX512-NEXT: vpextrb $11, %xmm0, %edi
+; AVX512-NEXT: vpinsrb $9, %edi, %xmm1, %xmm0
+; AVX512-NEXT: vpextrb $1, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $7, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $13, %xmm3, %edi
+; AVX512-NEXT: vpinsrb $12, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $3, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $13, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $9, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $14, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $15, %xmm4, %edi
+; AVX512-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm6, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm7, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX512-NEXT: vmovdqa %xmm9, (%r8)
+; AVX512-NEXT: vmovdqa %xmm10, (%r9)
+; AVX512-NEXT: vmovdqa %xmm0, (%rax)
; AVX512-NEXT: retq
%wide.vec = load <96 x i8>, ptr %in.vec, align 32
@@ -1272,143 +1270,145 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i8_stride6_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $264, %rsp # imm = 0x108
-; SSE-NEXT: movdqa 64(%rdi), %xmm9
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 80(%rdi), %xmm14
-; SSE-NEXT: movdqa (%rdi), %xmm13
-; SSE-NEXT: movdqa 16(%rdi), %xmm8
+; SSE-NEXT: subq $280, %rsp # imm = 0x118
+; SSE-NEXT: movdqa 64(%rdi), %xmm8
+; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 80(%rdi), %xmm6
+; SSE-NEXT: movdqa (%rdi), %xmm15
+; SSE-NEXT: movdqa 16(%rdi), %xmm13
; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: pandn %xmm1, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm10, %xmm1
; SSE-NEXT: pandn %xmm7, %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm5, %xmm1
; SSE-NEXT: pandn %xmm7, %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm7, %xmm15
-; SSE-NEXT: pand %xmm4, %xmm15
-; SSE-NEXT: por %xmm5, %xmm15
-; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: pand %xmm12, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: pandn %xmm8, %xmm2
-; SSE-NEXT: movdqa %xmm13, %xmm11
-; SSE-NEXT: pand %xmm3, %xmm11
-; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: por %xmm2, %xmm11
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: pand %xmm12, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm2
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pandn %xmm14, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm9
-; SSE-NEXT: por %xmm1, %xmm9
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: pand %xmm12, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
+; SSE-NEXT: movdqa %xmm7, %xmm14
+; SSE-NEXT: pand %xmm5, %xmm14
+; SSE-NEXT: por %xmm0, %xmm14
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: pandn %xmm13, %xmm1
+; SSE-NEXT: movdqa %xmm15, %xmm12
+; SSE-NEXT: pand %xmm10, %xmm12
+; SSE-NEXT: por %xmm1, %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: pand %xmm11, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; SSE-NEXT: movdqa %xmm5, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm2
-; SSE-NEXT: por %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 128(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm9, %xmm2
; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: movdqa 144(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: movdqa %xmm6, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pand %xmm12, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pandn %xmm2, %xmm7
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: pandn %xmm13, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: por %xmm0, %xmm8
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm0, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 128(%rdi), %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: movdqa 144(%rdi), %xmm3
+; SSE-NEXT: movdqa %xmm10, %xmm2
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm5, %xmm3
+; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm9
+; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm15, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 112(%rdi), %xmm0
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: movdqa 160(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pandn %xmm8, %xmm4
-; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
-; SSE-NEXT: pand %xmm2, %xmm13
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pandn %xmm14, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm2, %xmm14
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm10, %xmm7
+; SSE-NEXT: pandn %xmm0, %xmm7
+; SSE-NEXT: movdqa 160(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: movdqa %xmm13, %xmm2
+; SSE-NEXT: pandn %xmm13, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm1, %xmm15
+; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: pandn %xmm6, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: movdqa 96(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: pand %xmm2, %xmm10
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm13
+; SSE-NEXT: movdqa %xmm13, %xmm10
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm1, %xmm10
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm2, %xmm8
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm10, %xmm8
-; SSE-NEXT: pand %xmm2, %xmm10
+; SSE-NEXT: movdqa %xmm10, %xmm6
+; SSE-NEXT: pand %xmm1, %xmm10
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm2, %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm2, %xmm10
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm0, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm2, %xmm10
+; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm1, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: por %xmm6, %xmm0
+; SSE-NEXT: por %xmm7, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: pand %xmm12, %xmm5
+; SSE-NEXT: pand %xmm11, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
@@ -1416,120 +1416,119 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; SSE-NEXT: packuswb %xmm5, %xmm5
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; SSE-NEXT: por %xmm10, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: pand %xmm12, %xmm2
+; SSE-NEXT: por %xmm9, %xmm5
+; SSE-NEXT: pandn %xmm13, %xmm2
+; SSE-NEXT: por %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: pand %xmm11, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; SSE-NEXT: movdqa %xmm6, %xmm7
-; SSE-NEXT: pandn %xmm2, %xmm7
-; SSE-NEXT: pand %xmm6, %xmm5
-; SSE-NEXT: por %xmm5, %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pxor %xmm10, %xmm10
-; SSE-NEXT: movdqa %xmm15, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; SSE-NEXT: psrld $16, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,1,0,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm1
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm14, %xmm5
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; SSE-NEXT: psrld $16, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm5[2],xmm14[3],xmm5[3]
+; SSE-NEXT: packuswb %xmm14, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: movdqa %xmm13, %xmm5
+; SSE-NEXT: pandn %xmm7, %xmm5
+; SSE-NEXT: movdqa %xmm12, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; SSE-NEXT: packuswb %xmm7, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm11, %xmm5
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535]
-; SSE-NEXT: movdqa %xmm15, %xmm7
-; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
-; SSE-NEXT: pand %xmm15, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: pand %xmm6, %xmm5
-; SSE-NEXT: por %xmm2, %xmm5
-; SSE-NEXT: movdqa %xmm9, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,5,5,5,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
-; SSE-NEXT: movdqa %xmm7, %xmm2
-; SSE-NEXT: pandn %xmm11, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,4]
-; SSE-NEXT: pand %xmm7, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
-; SSE-NEXT: packuswb %xmm6, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; SSE-NEXT: movdqa %xmm11, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm6
-; SSE-NEXT: pand %xmm11, %xmm5
-; SSE-NEXT: por %xmm5, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; SSE-NEXT: psrld $16, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,0,0,65535,65535]
+; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: pandn %xmm7, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,0,4,5,6,7]
+; SSE-NEXT: pand %xmm14, %xmm7
+; SSE-NEXT: por %xmm1, %xmm7
+; SSE-NEXT: packuswb %xmm7, %xmm7
+; SSE-NEXT: pand %xmm13, %xmm7
+; SSE-NEXT: por %xmm5, %xmm7
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535]
+; SSE-NEXT: movdqa %xmm5, %xmm12
+; SSE-NEXT: pandn %xmm1, %xmm12
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE-NEXT: packuswb %xmm1, %xmm5
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: por %xmm12, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm9, %xmm8
+; SSE-NEXT: pandn %xmm1, %xmm8
+; SSE-NEXT: pand %xmm9, %xmm7
+; SSE-NEXT: por %xmm7, %xmm8
+; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: packuswb %xmm3, %xmm7
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; SSE-NEXT: pand %xmm15, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm15
-; SSE-NEXT: por %xmm0, %xmm15
-; SSE-NEXT: packuswb %xmm15, %xmm15
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: pand %xmm0, %xmm15
-; SSE-NEXT: pandn %xmm5, %xmm0
-; SSE-NEXT: por %xmm0, %xmm15
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE-NEXT: pand %xmm14, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm14
+; SSE-NEXT: por %xmm0, %xmm14
+; SSE-NEXT: packuswb %xmm14, %xmm14
+; SSE-NEXT: movdqa %xmm13, %xmm0
+; SSE-NEXT: pand %xmm13, %xmm14
+; SSE-NEXT: pandn %xmm7, %xmm0
+; SSE-NEXT: por %xmm0, %xmm14
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
-; SSE-NEXT: pand %xmm7, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: por %xmm1, %xmm7
-; SSE-NEXT: packuswb %xmm7, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: packuswb %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm9, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm11, %xmm15
-; SSE-NEXT: por %xmm15, %xmm1
+; SSE-NEXT: pand %xmm9, %xmm14
+; SSE-NEXT: por %xmm14, %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: pand %xmm5, %xmm4
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: pand %xmm4, %xmm6
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
@@ -1537,457 +1536,454 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: por (%rsp), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm13, %xmm1
-; SSE-NEXT: pand %xmm12, %xmm1
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: pand %xmm11, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5]
-; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm2, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa %xmm13, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: por %xmm0, %xmm3
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: por %xmm0, %xmm14
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: por %xmm0, %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: pand %xmm11, %xmm3
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm5, %xmm8
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: pand %xmm12, %xmm0
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: pand %xmm9, %xmm3
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm4, %xmm10
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm9, %xmm2
-; SSE-NEXT: pand %xmm12, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm2, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: pand %xmm11, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm13, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: por %xmm0, %xmm8
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: pand %xmm11, %xmm3
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
+; SSE-NEXT: movdqa %xmm9, %xmm14
+; SSE-NEXT: pandn %xmm0, %xmm14
+; SSE-NEXT: pand %xmm9, %xmm3
+; SSE-NEXT: por %xmm3, %xmm14
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm15, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pandn %xmm0, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,3,2,1]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pandn %xmm0, %xmm5
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm5
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE-NEXT: movdqa %xmm13, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: pand %xmm13, %xmm1
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm15
-; SSE-NEXT: pandn %xmm0, %xmm15
-; SSE-NEXT: pand %xmm11, %xmm5
-; SSE-NEXT: por %xmm5, %xmm15
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15]
-; SSE-NEXT: movdqa %xmm8, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[3,0]
+; SSE-NEXT: movdqa %xmm9, %xmm12
+; SSE-NEXT: pandn %xmm0, %xmm12
+; SSE-NEXT: pand %xmm9, %xmm4
+; SSE-NEXT: por %xmm4, %xmm12
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; SSE-NEXT: movdqa %xmm10, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[3,0]
; SSE-NEXT: movaps %xmm0, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm5
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE-NEXT: packuswb %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,3,2,1]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,3,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
; SSE-NEXT: pand %xmm3, %xmm6
; SSE-NEXT: pandn %xmm0, %xmm3
; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm5
+; SSE-NEXT: pand %xmm13, %xmm4
; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm1
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE-NEXT: pandn %xmm3, %xmm13
+; SSE-NEXT: por %xmm4, %xmm13
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,0,3]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: pand %xmm11, %xmm1
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm0, %xmm11
-; SSE-NEXT: por %xmm1, %xmm11
-; SSE-NEXT: movdqa %xmm11, %xmm14
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pand %xmm12, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm5, %xmm3
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: por %xmm3, %xmm5
+; SSE-NEXT: pand %xmm9, %xmm13
+; SSE-NEXT: packuswb %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm9
+; SSE-NEXT: por %xmm13, %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: packuswb %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,0,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: pand %xmm9, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: pand %xmm11, %xmm1
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm5, %xmm5
+; SSE-NEXT: pand %xmm3, %xmm5
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: pand %xmm13, %xmm15
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm15, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; SSE-NEXT: packuswb %xmm0, %xmm3
+; SSE-NEXT: packuswb %xmm0, %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; SSE-NEXT: movdqa %xmm0, %xmm13
-; SSE-NEXT: pandn %xmm3, %xmm13
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: por %xmm2, %xmm13
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm12, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pandn %xmm2, %xmm3
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: pand %xmm12, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,0,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: pand %xmm9, %xmm5
-; SSE-NEXT: por %xmm3, %xmm5
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pand %xmm11, %xmm2
-; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; SSE-NEXT: por %xmm2, %xmm11
-; SSE-NEXT: pand %xmm11, %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; SSE-NEXT: packuswb %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm4, %xmm6
; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3]
-; SSE-NEXT: psrlq $48, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
-; SSE-NEXT: packuswb %xmm5, %xmm3
-; SSE-NEXT: movdqa %xmm9, %xmm6
-; SSE-NEXT: pandn %xmm3, %xmm6
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,1,1,2,4,5,6,7]
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: por %xmm4, %xmm7
-; SSE-NEXT: packuswb %xmm7, %xmm7
-; SSE-NEXT: pand %xmm9, %xmm7
-; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: por %xmm5, %xmm6
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm11, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: pandn %xmm4, %xmm5
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,5,7,4]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,0,0]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm6, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: pand %xmm11, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,1,0,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm7, %xmm7
+; SSE-NEXT: pand %xmm3, %xmm7
+; SSE-NEXT: por %xmm5, %xmm7
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: pand %xmm13, %xmm4
+; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; SSE-NEXT: por %xmm4, %xmm13
+; SSE-NEXT: pand %xmm13, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
+; SSE-NEXT: packuswb %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pandn %xmm5, %xmm4
; SSE-NEXT: pand %xmm0, %xmm7
-; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3]
-; SSE-NEXT: psrlq $48, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7]
-; SSE-NEXT: packuswb %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm8, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,1,2,4,5,6,7]
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
+; SSE-NEXT: por %xmm7, %xmm4
+; SSE-NEXT: movdqa %xmm8, %xmm5
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm8, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3]
+; SSE-NEXT: psrlq $48, %xmm5
+; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7]
+; SSE-NEXT: packuswb %xmm7, %xmm5
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: pandn %xmm5, %xmm7
+; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pandn %xmm8, %xmm9
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[3,1,1,2,4,5,6,7]
+; SSE-NEXT: pand %xmm5, %xmm10
+; SSE-NEXT: por %xmm9, %xmm10
+; SSE-NEXT: packuswb %xmm10, %xmm10
+; SSE-NEXT: pand %xmm3, %xmm10
+; SSE-NEXT: por %xmm7, %xmm10
+; SSE-NEXT: movdqa %xmm15, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,7,4]
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0]
+; SSE-NEXT: movdqa %xmm7, %xmm11
+; SSE-NEXT: pandn %xmm9, %xmm11
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pand %xmm7, %xmm8
+; SSE-NEXT: por %xmm8, %xmm11
+; SSE-NEXT: packuswb %xmm11, %xmm9
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pandn %xmm9, %xmm8
+; SSE-NEXT: pand %xmm0, %xmm10
+; SSE-NEXT: por %xmm10, %xmm8
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm11, %xmm9
+; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm11, %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3]
+; SSE-NEXT: psrlq $48, %xmm9
+; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7]
+; SSE-NEXT: packuswb %xmm10, %xmm9
+; SSE-NEXT: movdqa %xmm1, %xmm10
+; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7]
+; SSE-NEXT: pand %xmm5, %xmm11
+; SSE-NEXT: pandn %xmm10, %xmm5
+; SSE-NEXT: por %xmm11, %xmm5
; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: pand %xmm9, %xmm5
-; SSE-NEXT: pandn %xmm3, %xmm9
-; SSE-NEXT: por %xmm9, %xmm5
-; SSE-NEXT: movdqa %xmm11, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,1,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4]
-; SSE-NEXT: pandn %xmm4, %xmm1
-; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm5
+; SSE-NEXT: pandn %xmm9, %xmm3
+; SSE-NEXT: por %xmm3, %xmm5
+; SSE-NEXT: movdqa %xmm13, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,4]
+; SSE-NEXT: pandn %xmm3, %xmm7
+; SSE-NEXT: por %xmm2, %xmm7
; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm7, %xmm2
+; SSE-NEXT: pandn %xmm2, %xmm0
; SSE-NEXT: por %xmm5, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, 16(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, (%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, 16(%rcx)
+; SSE-NEXT: movdqa %xmm14, 16(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rcx)
-; SSE-NEXT: movdqa %xmm14, 16(%r8)
-; SSE-NEXT: movdqa %xmm15, (%r8)
-; SSE-NEXT: movdqa %xmm2, 16(%r9)
-; SSE-NEXT: movdqa %xmm13, (%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%r8)
+; SSE-NEXT: movdqa %xmm12, (%r8)
+; SSE-NEXT: movdqa %xmm4, 16(%r9)
+; SSE-NEXT: movdqa %xmm6, (%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movdqa %xmm0, 16(%rax)
-; SSE-NEXT: movdqa %xmm6, (%rax)
-; SSE-NEXT: addq $264, %rsp # imm = 0x108
+; SSE-NEXT: movdqa %xmm8, (%rax)
+; SSE-NEXT: addq $280, %rsp # imm = 0x118
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_stride6_vf32:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $168, %rsp
-; AVX1-NEXT: vmovdqa 128(%rdi), %xmm14
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vmovdqa 144(%rdi), %xmm7
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vmovdqa 128(%rdi), %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vmovdqa 144(%rdi), %xmm5
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vmovdqa 176(%rdi), %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10]
-; AVX1-NEXT: vmovdqa %xmm2, %xmm15
-; AVX1-NEXT: vmovdqa 160(%rdi), %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
-; AVX1-NEXT: vmovdqa %xmm3, %xmm6
+; AVX1-NEXT: vmovdqa 176(%rdi), %xmm6
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[4,10]
+; AVX1-NEXT: vmovdqa 160(%rdi), %xmm7
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,255,255,255,255,255,0,0,0,0,0>
-; AVX1-NEXT: vpblendvb %xmm8, %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,255,255,255,255,255,0,0,0,0,0>
+; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,11]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,11]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm8, %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm13
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm3
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm15
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm14
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm15, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vmovdqa (%rdi), %xmm10
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm9
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[4,10,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[2,8,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u>
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm12, %xmm13, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm5, %xmm11, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[3,9,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa (%rdi), %xmm9
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[4,10,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[2,8,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm2, %xmm10, %xmm10
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u>
+; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm10, %xmm3
+; AVX1-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm14, %xmm10
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm12
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm8[5,11,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm12, %xmm13, %xmm12
+; AVX1-NEXT: vpblendvb %xmm2, %xmm10, %xmm12, %xmm0
; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm0
-; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm8, %xmm14, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,6,12]
-; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,255,255,255,255,255,255,0,0,0,0,0>
-; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[0,6,12]
+; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,255,255,255,255,255,255,0,0,0,0,0>
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm0
-; AVX1-NEXT: vpshufb %xmm5, %xmm14, %xmm1
+; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm0
+; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[1,7,13]
-; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[1,7,13]
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[4,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm9[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u>
; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[5,11],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm9[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm8[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,2,8,14,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,2,8,14,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm3
-; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,6,12,128,128,128,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm2
@@ -1995,48 +1991,48 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10]
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
-; AVX1-NEXT: vpor %xmm4, %xmm15, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
+; AVX1-NEXT: vpor %xmm4, %xmm13, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX1-NEXT: vpshufb %xmm12, %xmm9, %xmm7
-; AVX1-NEXT: vpshufb %xmm6, %xmm10, %xmm6
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
+; AVX1-NEXT: vpshufb %xmm10, %xmm8, %xmm10
+; AVX1-NEXT: vpshufb %xmm6, %xmm9, %xmm6
+; AVX1-NEXT: vpor %xmm6, %xmm10, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7]
; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandnps %ymm4, %ymm6, %ymm4
; AVX1-NEXT: vandps %ymm6, %ymm5, %ymm5
; AVX1-NEXT: vorps %ymm4, %ymm5, %ymm4
-; AVX1-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
-; AVX1-NEXT: vandps %ymm4, %ymm15, %ymm4
-; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm7 # 16-byte Folded Reload
-; AVX1-NEXT: vandnps %ymm7, %ymm15, %ymm7
-; AVX1-NEXT: vorps %ymm7, %ymm4, %ymm4
+; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm10 # 16-byte Folded Reload
+; AVX1-NEXT: vandnps %ymm10, %ymm5, %ymm10
+; AVX1-NEXT: vorps %ymm4, %ymm10, %ymm4
; AVX1-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,3,9,15,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <1,7,13,128,128,128,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm12
-; AVX1-NEXT: vpor %xmm7, %xmm12, %xmm12
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
-; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm12
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm7
-; AVX1-NEXT: vpshufb %xmm8, %xmm10, %xmm5
-; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
-; AVX1-NEXT: vandnps %ymm12, %ymm6, %ymm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,9,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm13
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,7,13,128,128,128,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm12
+; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
+; AVX1-NEXT: vpor %xmm13, %xmm11, %xmm11
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm12[3,4,5],xmm4[6,7]
+; AVX1-NEXT: vandnps %ymm11, %ymm6, %ymm7
; AVX1-NEXT: vandps %ymm6, %ymm4, %ymm4
-; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4
-; AVX1-NEXT: vandps %ymm4, %ymm15, %ymm4
-; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
-; AVX1-NEXT: vandnps %ymm5, %ymm15, %ymm5
+; AVX1-NEXT: vorps %ymm7, %ymm4, %ymm4
+; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload
+; AVX1-NEXT: vandnps %ymm6, %ymm5, %ymm5
; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4
; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[4,10,u,u,u,u,u,u,u,u,u,u,u]
@@ -2046,87 +2042,87 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12]
; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandnps %ymm5, %ymm7, %ymm5
-; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX1-NEXT: vorps %ymm5, %ymm6, %ymm6
-; AVX1-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vandps %ymm6, %ymm15, %ymm6
+; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5
+; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
+; AVX1-NEXT: vorps %ymm5, %ymm7, %ymm7
+; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %ymm7, %ymm10, %ymm7
; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload
-; AVX1-NEXT: vandnps %ymm8, %ymm15, %ymm8
-; AVX1-NEXT: vorps %ymm6, %ymm8, %ymm9
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[5,11,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm10
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13]
-; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4
-; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4
-; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX1-NEXT: vorps %ymm4, %ymm6, %ymm4
-; AVX1-NEXT: vandps %ymm4, %ymm15, %ymm4
-; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload
-; AVX1-NEXT: vandnps %ymm6, %ymm15, %ymm6
-; AVX1-NEXT: vorps %ymm6, %ymm4, %ymm10
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[4,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,128,128,128,2,8,14>
-; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,u,u,0,6,12,128,128,128>
-; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm7
-; AVX1-NEXT: vpor %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
-; AVX1-NEXT: vandps %ymm4, %ymm13, %ymm4
-; AVX1-NEXT: vorps %ymm7, %ymm4, %ymm14
+; AVX1-NEXT: vandnps %ymm8, %ymm10, %ymm8
+; AVX1-NEXT: vorps %ymm7, %ymm8, %ymm13
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[5,11,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13]
+; AVX1-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT: vandnps %ymm7, %ymm6, %ymm7
+; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX1-NEXT: vorps %ymm7, %ymm6, %ymm6
+; AVX1-NEXT: vandps %ymm6, %ymm10, %ymm6
+; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
+; AVX1-NEXT: vandnps %ymm7, %ymm10, %ymm7
+; AVX1-NEXT: vorps %ymm7, %ymm6, %ymm6
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,128,128,128,2,8,14>
+; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,0,6,12,128,128,128>
+; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm12
+; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7
+; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
+; AVX1-NEXT: vandps %ymm7, %ymm9, %ymm7
+; AVX1-NEXT: vorps %ymm7, %ymm12, %ymm7
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm7
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm12, %xmm8, %xmm6
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm11
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm12[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm11[5,6,7]
-; AVX1-NEXT: vandps %ymm15, %ymm14, %ymm11
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-NEXT: vandnps %ymm4, %ymm15, %ymm4
-; AVX1-NEXT: vorps %ymm4, %ymm11, %ymm11
+; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm8
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm11
+; AVX1-NEXT: vpor %xmm8, %xmm11, %xmm8
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3,4],xmm8[5,6,7]
+; AVX1-NEXT: vandps %ymm7, %ymm10, %ymm7
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX1-NEXT: vandnps %ymm8, %ymm10, %ymm8
+; AVX1-NEXT: vorps %ymm7, %ymm8, %ymm7
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,128,128,128,3,9,15>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,1,7,13,128,128,128>
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,1,7,13,128,128,128>
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
-; AVX1-NEXT: vandps %ymm0, %ymm13, %ymm0
+; AVX1-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload
+; AVX1-NEXT: vandps %ymm0, %ymm9, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm2
+; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX1-NEXT: vandps %ymm0, %ymm15, %ymm0
+; AVX1-NEXT: vandps %ymm0, %ymm10, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vandnps %ymm1, %ymm15, %ymm1
+; AVX1-NEXT: vandnps %ymm1, %ymm10, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, (%rsi)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm1, (%rdx)
-; AVX1-NEXT: vmovaps %ymm9, (%rcx)
-; AVX1-NEXT: vmovaps %ymm10, (%r8)
-; AVX1-NEXT: vmovaps %ymm11, (%r9)
+; AVX1-NEXT: vmovaps %ymm13, (%rcx)
+; AVX1-NEXT: vmovaps %ymm6, (%r8)
+; AVX1-NEXT: vmovaps %ymm7, (%r9)
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: vmovaps %ymm0, (%rax)
; AVX1-NEXT: addq $168, %rsp
@@ -2135,117 +2131,114 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-LABEL: load_i8_stride6_vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm14
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13
+; AVX2-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4
-; AVX2-NEXT: vmovdqa 160(%rdi), %ymm12
+; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm6
; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-NEXT: vpblendvb %ymm9, %ymm14, %ymm13, %ymm7
+; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm7
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[0,6,12,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm10
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm8, %xmm11
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm4[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[20,26]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm4[0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,8,14],zero,zero,ymm4[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,24,30],zero,zero
; AVX2-NEXT: vpor %ymm0, %ymm8, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm8, %ymm11, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[21,27]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15],zero,zero,ymm4[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,25,31],zero,zero
-; AVX2-NEXT: vpor %ymm2, %ymm7, %ymm2
-; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
-; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm14, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,22,28]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,10],zero,zero,zero,ymm4[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[20,26],zero,zero,zero
-; AVX2-NEXT: vpor %ymm7, %ymm10, %ymm7
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7
+; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,ymm3[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[21,27]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm4[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15],zero,zero,ymm4[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,25,31],zero,zero
+; AVX2-NEXT: vpor %ymm10, %ymm11, %ymm10
+; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
+; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm10
+; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm13, %xmm14, %xmm13
+; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,ymm3[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,22,28]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,10],zero,zero,zero,ymm4[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[20,26],zero,zero,zero
+; AVX2-NEXT: vpor %ymm14, %ymm15, %ymm14
; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
-; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm9
-; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,23,29]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,11],zero,zero,zero,ymm4[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[21,27],zero,zero,zero
-; AVX2-NEXT: vpor %ymm2, %ymm7, %ymm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero
-; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12]
-; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm9
+; AVX2-NEXT: vpblendvb %ymm8, %ymm13, %ymm14, %ymm13
+; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,ymm3[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,23,29]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,11],zero,zero,zero,ymm4[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[21,27],zero,zero,zero
+; AVX2-NEXT: vpor %ymm11, %ymm14, %ymm11
+; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero
+; AVX2-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm8
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12]
+; AVX2-NEXT: vpor %xmm14, %xmm10, %xmm10
+; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm5, %ymm8
-; AVX2-NEXT: vpblendvb %ymm15, %ymm12, %ymm6, %ymm10
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13]
-; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm9
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
-; AVX2-NEXT: vpblendvb %ymm0, %ymm12, %ymm6, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[4,10],zero,zero,zero,xmm1[2,8,14]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,2,8,14],zero,zero,xmm0[0,6,12],zero,zero,zero
-; AVX2-NEXT: vpor %xmm2, %xmm5, %xmm12
-; AVX2-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm5
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[4,10,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[4,10],zero,zero,zero,xmm5[2,8,14],zero,zero,xmm5[u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,22,28],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,ymm3[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,8,14],zero,zero,ymm3[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,24,30]
-; AVX2-NEXT: vpor %ymm7, %ymm13, %ymm7
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm7
-; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm7, %ymm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[5,11],zero,zero,zero,xmm1[3,9,15]
+; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm10, %ymm13
+; AVX2-NEXT: vpblendvb %ymm15, %ymm5, %ymm6, %ymm10
+; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13]
+; AVX2-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm9
+; AVX2-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[2,8,14]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,2,8,14],zero,zero,xmm0[0,6,12],zero,zero,zero
+; AVX2-NEXT: vpor %xmm6, %xmm8, %xmm6
+; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm8, %xmm12, %xmm8
+; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,22,28],zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,ymm3[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,8,14],zero,zero,ymm3[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,24,30]
+; AVX2-NEXT: vpor %ymm12, %ymm14, %ymm12
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm12[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm6, %ymm6
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[3,9,15]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,3,9,15],zero,zero,xmm0[1,7,13],zero,zero,zero
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[5,11,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,11],zero,zero,zero,xmm5[3,9,15],zero,zero,xmm5[u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,23,29],zero,zero,zero
+; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,23,29],zero,zero,zero
; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,9,15],zero,zero,ymm3[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,25,31]
-; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero
-; AVX2-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[4,10]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero
+; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero
-; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[5,11]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm2, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm13, (%rcx)
; AVX2-NEXT: vmovdqa %ymm9, (%r8)
-; AVX2-NEXT: vmovdqa %ymm2, (%r9)
+; AVX2-NEXT: vmovdqa %ymm6, (%r9)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
@@ -2253,118 +2246,118 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i8_stride6_vf32:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovdqa (%rdi), %ymm10
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm13
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovdqa (%rdi), %ymm6
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9
-; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm11
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,8,14],zero,zero,ymm11[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[18,24,30],zero,zero
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm2[2,3],mem[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm12[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[4,10],zero,zero,zero,ymm12[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,26]
+; AVX512-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm0
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,8,14],zero,zero,ymm0[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,24,30],zero,zero
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm2[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,10],zero,zero,zero,ymm2[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[20,26]
; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
-; AVX512-NEXT: movw $18724, %ax # imm = 0x4924
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vpblendmw %ymm13, %ymm10, %ymm7 {%k1}
+; AVX512-NEXT: movw $18724, %r10w # imm = 0x4924
+; AVX512-NEXT: kmovd %r10d, %k1
+; AVX512-NEXT: vpblendmw %ymm1, %ymm6, %ymm7 {%k1}
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512-NEXT: movl $4192256, %eax # imm = 0x3FF800
-; AVX512-NEXT: kmovd %eax, %k2
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm5, %xmm9, %xmm5
+; AVX512-NEXT: movl $4192256, %r10d # imm = 0x3FF800
+; AVX512-NEXT: kmovd %r10d, %k2
; AVX512-NEXT: vmovdqu8 %ymm4, %ymm5 {%k2}
; AVX512-NEXT: vmovdqa 160(%rdi), %ymm4
-; AVX512-NEXT: vpblendmw %ymm9, %ymm4, %ymm6 {%k1}
-; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u],zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,4,10],zero,zero,zero,xmm6[2,8,14],zero,zero
-; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,9,15],zero,zero,ymm11[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[19,25,31],zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm12[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[5,11],zero,zero,zero,ymm12[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[21,27]
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vmovdqu8 %ymm1, %ymm2 {%k2}
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,5,11],zero,zero,zero,xmm6[3,9,15],zero,zero
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,10],zero,zero,zero,ymm11[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,26],zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm12[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,6,12],zero,zero,zero,ymm12[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,22,28]
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: movw $9362, %ax # imm = 0x2492
-; AVX512-NEXT: kmovd %eax, %k3
-; AVX512-NEXT: vpblendmw %ymm10, %ymm13, %ymm1 {%k3}
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm8
-; AVX512-NEXT: movl $2095104, %eax # imm = 0x1FF800
-; AVX512-NEXT: kmovd %eax, %k4
-; AVX512-NEXT: vmovdqu8 %ymm0, %ymm8 {%k4}
-; AVX512-NEXT: vpblendmw %ymm4, %ymm9, %ymm0 {%k1}
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,0,6,12],zero,zero,zero,xmm0[4,10],zero,zero,zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12]
-; AVX512-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: movl $-2097152, %eax # imm = 0xFFE00000
-; AVX512-NEXT: kmovd %eax, %k2
-; AVX512-NEXT: vmovdqu8 %ymm3, %ymm8 {%k2}
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[5,11],zero,zero,zero,ymm11[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[21,27],zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm12[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[1,7,13],zero,zero,zero,ymm12[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[17,23,29]
-; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[3,9,15],zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqu8 %ymm3, %ymm1 {%k4}
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,7,13],zero,zero,zero,xmm0[5,11],zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13]
-; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqu8 %ymm0, %ymm1 {%k2}
-; AVX512-NEXT: vmovdqu16 %ymm10, %ymm13 {%k1}
-; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[4,10],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm12[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,8,14],zero,zero,ymm12[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[18,24,30]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[0,6,12],zero,zero,zero,ymm11[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,22,28],zero,zero,zero
-; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vmovdqu16 %ymm9, %ymm4 {%k3}
+; AVX512-NEXT: vpblendmw %ymm3, %ymm4, %ymm9 {%k1}
+; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[4,10]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero
+; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm11[3,4,5,6,7],ymm5[8,9,10],ymm11[11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,9,15],zero,zero,ymm0[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[19,25,31],zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,ymm2[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,11],zero,zero,zero,ymm2[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[21,27]
+; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512-NEXT: vmovdqu8 %ymm11, %ymm7 {%k2}
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[5,11]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero
+; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,10],zero,zero,zero,ymm0[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[20,26],zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm2[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,6,12],zero,zero,zero,ymm2[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,22,28]
+; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm9
+; AVX512-NEXT: movw $9362, %di # imm = 0x2492
+; AVX512-NEXT: kmovd %edi, %k3
+; AVX512-NEXT: vpblendmw %ymm6, %ymm1, %ymm10 {%k3}
+; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm8, %xmm12, %xmm8
+; AVX512-NEXT: movl $2095104, %edi # imm = 0x1FF800
+; AVX512-NEXT: kmovd %edi, %k4
+; AVX512-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
+; AVX512-NEXT: vpblendmw %ymm4, %ymm3, %ymm9 {%k1}
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero
+; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm13
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[0,6,12]
+; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12
+; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
+; AVX512-NEXT: kmovd %edi, %k2
+; AVX512-NEXT: vmovdqu8 %ymm12, %ymm8 {%k2}
+; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,11],zero,zero,zero,ymm0[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[21,27],zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,ymm2[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,7,13],zero,zero,zero,ymm2[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[17,23,29]
+; AVX512-NEXT: vpor %ymm12, %ymm14, %ymm12
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512-NEXT: vmovdqu8 %ymm12, %ymm10 {%k4}
+; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[1,7,13]
+; AVX512-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-NEXT: vmovdqu8 %ymm9, %ymm10 {%k2}
+; AVX512-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[4,10,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,ymm2[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14],zero,zero,ymm2[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,24,30]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,6,12],zero,zero,zero,ymm0[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,22,28],zero,zero,zero
+; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm11
+; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-NEXT: vmovdqu16 %ymm3, %ymm4 {%k3}
; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u],zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[2,8,14]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero
-; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vmovdqu8 %ymm5, %ymm2 {%k2}
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[5,11],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm0, %xmm5, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[3,9,15],zero,zero,ymm12[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[19,25,31]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm11[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[1,7,13],zero,zero,zero,ymm11[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[17,23,29],zero,zero,zero
-; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[3,9,15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero
-; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vmovdqu8 %ymm3, %ymm0 {%k2}
-; AVX512-NEXT: vmovdqa %ymm14, (%rsi)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u],zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[2,8,14]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero
+; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vmovdqu8 %ymm11, %ymm9 {%k2}
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[5,11,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15],zero,zero,ymm2[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,25,31]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[1,7,13],zero,zero,zero,ymm0[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[17,23,29],zero,zero,zero
+; AVX512-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u],zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[3,9,15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero
+; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
+; AVX512-NEXT: vmovdqa %ymm5, (%rsi)
; AVX512-NEXT: vmovdqa %ymm7, (%rdx)
; AVX512-NEXT: vmovdqa %ymm8, (%rcx)
-; AVX512-NEXT: vmovdqa %ymm1, (%r8)
-; AVX512-NEXT: vmovdqa %ymm2, (%r9)
-; AVX512-NEXT: vmovdqa %ymm0, (%r10)
+; AVX512-NEXT: vmovdqa %ymm10, (%r8)
+; AVX512-NEXT: vmovdqa %ymm9, (%r9)
+; AVX512-NEXT: vmovdqa %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%wide.vec = load <192 x i8>, ptr %in.vec, align 32
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll
index 34ba67925dd65..536a7a89841dc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll
@@ -192,9 +192,9 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-NEXT: movdqa (%rsi), %xmm4
; SSE-NEXT: movdqa 16(%rsi), %xmm5
; SSE-NEXT: movdqa 32(%rsi), %xmm6
-; SSE-NEXT: movdqa 48(%rsi), %xmm8
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE-NEXT: movdqa 48(%rsi), %xmm7
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
@@ -203,8 +203,8 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
; SSE-NEXT: movdqa %xmm3, 96(%rdx)
; SSE-NEXT: movdqa %xmm6, 112(%rdx)
; SSE-NEXT: movdqa %xmm2, 64(%rdx)
@@ -212,7 +212,7 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-NEXT: movdqa %xmm1, 32(%rdx)
; SSE-NEXT: movdqa %xmm4, 48(%rdx)
; SSE-NEXT: movdqa %xmm0, (%rdx)
-; SSE-NEXT: movdqa %xmm7, 16(%rdx)
+; SSE-NEXT: movdqa %xmm8, 16(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: vf32:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index f3d003dfa1e88..26dca3a38ba51 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -308,77 +308,77 @@ define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec
define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm8
-; SSE-NEXT: movdqa 16(%rdi), %xmm10
-; SSE-NEXT: movdqa (%rsi), %xmm12
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm6
+; SSE-NEXT: movdqa (%rsi), %xmm2
; SSE-NEXT: movdqa 16(%rsi), %xmm7
-; SSE-NEXT: movdqa (%rdx), %xmm9
-; SSE-NEXT: movdqa 16(%rdx), %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa (%rdx), %xmm4
+; SSE-NEXT: movdqa 16(%rdx), %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pandn %xmm5, %xmm8
+; SSE-NEXT: por %xmm3, %xmm8
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,2]
+; SSE-NEXT: pand %xmm3, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pandn %xmm1, %xmm5
-; SSE-NEXT: por %xmm0, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm4
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,4,5]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: por %xmm4, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: por %xmm0, %xmm6
-; SSE-NEXT: pand %xmm3, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2]
-; SSE-NEXT: pandn %xmm0, %xmm3
-; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
-; SSE-NEXT: movdqa %xmm8, %xmm4
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,4,5]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,1,3,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7]
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: por %xmm6, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,2,3,3]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,1,3,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,1,0,4,5,6,7]
-; SSE-NEXT: pand %xmm4, %xmm7
-; SSE-NEXT: pandn %xmm6, %xmm4
-; SSE-NEXT: por %xmm7, %xmm4
-; SSE-NEXT: movdqa %xmm4, 32(%rcx)
-; SSE-NEXT: movdqa %xmm0, 80(%rcx)
-; SSE-NEXT: movdqa %xmm2, (%rcx)
+; SSE-NEXT: pandn %xmm10, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,0,0]
+; SSE-NEXT: movdqa %xmm0, %xmm10
+; SSE-NEXT: pandn %xmm8, %xmm10
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,4,5]
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: por %xmm10, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,2,2]
+; SSE-NEXT: pand %xmm0, %xmm10
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm0, %xmm12
+; SSE-NEXT: pandn %xmm11, %xmm12
+; SSE-NEXT: por %xmm10, %xmm12
+; SSE-NEXT: pand %xmm3, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,2,2]
+; SSE-NEXT: pandn %xmm10, %xmm3
+; SSE-NEXT: por %xmm12, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0]
+; SSE-NEXT: movdqa %xmm1, %xmm11
+; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,4,5]
+; SSE-NEXT: pand %xmm0, %xmm11
+; SSE-NEXT: pandn %xmm10, %xmm0
+; SSE-NEXT: por %xmm11, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm10, %xmm11
+; SSE-NEXT: pandn %xmm9, %xmm11
+; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7]
+; SSE-NEXT: pand %xmm10, %xmm6
+; SSE-NEXT: por %xmm11, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
+; SSE-NEXT: pand %xmm10, %xmm1
+; SSE-NEXT: pandn %xmm4, %xmm10
+; SSE-NEXT: por %xmm1, %xmm10
+; SSE-NEXT: movdqa %xmm10, 32(%rcx)
+; SSE-NEXT: movdqa %xmm6, 80(%rcx)
+; SSE-NEXT: movdqa %xmm0, (%rcx)
; SSE-NEXT: movdqa %xmm3, 16(%rcx)
-; SSE-NEXT: movdqa %xmm1, 48(%rcx)
+; SSE-NEXT: movdqa %xmm8, 48(%rcx)
; SSE-NEXT: movdqa %xmm5, 64(%rcx)
; SSE-NEXT: retq
;
@@ -395,25 +395,25 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
; AVX1-NEXT: vmovdqa (%rdx), %xmm5
; AVX1-NEXT: vmovdqa 16(%rdx), %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7]
+; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,0,0,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
@@ -422,10 +422,10 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7]
; AVX1-NEXT: vmovdqa %xmm0, 32(%rcx)
; AVX1-NEXT: vmovdqa %xmm1, 48(%rcx)
-; AVX1-NEXT: vmovdqa %xmm2, (%rcx)
-; AVX1-NEXT: vmovdqa %xmm11, 80(%rcx)
-; AVX1-NEXT: vmovdqa %xmm10, 16(%rcx)
-; AVX1-NEXT: vmovdqa %xmm8, 64(%rcx)
+; AVX1-NEXT: vmovdqa %xmm10, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm8, 80(%rcx)
+; AVX1-NEXT: vmovdqa %xmm7, 16(%rcx)
+; AVX1-NEXT: vmovdqa %xmm2, 64(%rcx)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf16:
@@ -475,7 +475,7 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
;
; AVX2-FAST-ALL-LABEL: vf16:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm8
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm2
; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rsi), %xmm3
@@ -483,43 +483,43 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm7
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3,4],xmm5[5],xmm1[6,7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = <5,5,u,6,6,u,7,7>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm2, %xmm3
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm5, %ymm5
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,2]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,0,0,u,1,1,u,2>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,3,3,u,4,4,u,5>
-; AVX2-FAST-ALL-NEXT: vpermd (%rdi), %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm4, %ymm4
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,3,3,u,4,4,u,5>
+; AVX2-FAST-ALL-NEXT: vpermd (%rdi), %ymm4, %ymm4
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 64(%rcx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 64(%rcx)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vf16:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3
@@ -527,37 +527,37 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3,4],xmm5[5],xmm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <5,5,u,6,6,u,7,7>
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7>
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <u,0,0,u,1,1,u,2>
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <u,3,3,u,4,4,u,5>
-; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,3,3,u,4,4,u,5>
+; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -591,98 +591,101 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa 16(%rdi), %xmm8
-; SSE-NEXT: movdqa 32(%rdi), %xmm11
+; SSE-NEXT: movdqa 16(%rdi), %xmm6
+; SSE-NEXT: movdqa 32(%rdi), %xmm4
; SSE-NEXT: movdqa 48(%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rsi), %xmm7
-; SSE-NEXT: movdqa 32(%rsi), %xmm6
-; SSE-NEXT: movdqa 48(%rsi), %xmm10
-; SSE-NEXT: movdqa 32(%rdx), %xmm9
-; SSE-NEXT: movdqa 48(%rdx), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 32(%rsi), %xmm8
+; SSE-NEXT: movdqa 48(%rsi), %xmm11
+; SSE-NEXT: movdqa 32(%rdx), %xmm10
+; SSE-NEXT: movdqa 48(%rdx), %xmm12
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2]
-; SSE-NEXT: movdqa %xmm0, %xmm12
+; SSE-NEXT: movdqa %xmm0, %xmm9
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pandn %xmm2, %xmm3
; SSE-NEXT: por %xmm1, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm12, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5]
-; SSE-NEXT: pand %xmm5, %xmm15
-; SSE-NEXT: por %xmm3, %xmm15
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5]
+; SSE-NEXT: pand %xmm5, %xmm9
+; SSE-NEXT: por %xmm3, %xmm9
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2]
; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa %xmm5, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm4
; SSE-NEXT: por %xmm1, %xmm4
; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2]
-; SSE-NEXT: movdqa %xmm2, %xmm13
-; SSE-NEXT: pandn %xmm1, %xmm13
-; SSE-NEXT: por %xmm4, %xmm13
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0]
+; SSE-NEXT: movdqa %xmm10, %xmm3
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2]
+; SSE-NEXT: movdqa %xmm2, %xmm10
+; SSE-NEXT: pandn %xmm1, %xmm10
+; SSE-NEXT: por %xmm4, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm11, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5]
-; SSE-NEXT: pand %xmm5, %xmm9
-; SSE-NEXT: por %xmm3, %xmm9
+; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,7,5,4,5]
+; SSE-NEXT: pand %xmm5, %xmm13
+; SSE-NEXT: por %xmm3, %xmm13
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2]
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: movdqa 16(%rdx), %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,2]
+; SSE-NEXT: movdqa 16(%rdx), %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
; SSE-NEXT: movdqa %xmm2, %xmm14
; SSE-NEXT: pandn %xmm1, %xmm14
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: por %xmm3, %xmm14
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm8, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,5,4,5]
-; SSE-NEXT: pand %xmm5, %xmm4
-; SSE-NEXT: por %xmm3, %xmm4
-; SSE-NEXT: movdqa (%rsi), %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5]
+; SSE-NEXT: pand %xmm5, %xmm15
+; SSE-NEXT: por %xmm3, %xmm15
+; SSE-NEXT: movdqa (%rsi), %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa %xmm5, %xmm0
; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: movdqa (%rdi), %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,2,2]
+; SSE-NEXT: movdqa (%rdi), %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
; SSE-NEXT: pand %xmm5, %xmm3
; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
@@ -690,154 +693,154 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2]
; SSE-NEXT: pandn %xmm12, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5]
; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0]
-; SSE-NEXT: pandn %xmm7, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,0,0]
+; SSE-NEXT: pandn %xmm12, %xmm5
; SSE-NEXT: por %xmm0, %xmm5
-; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[2,2,3,3]
+; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = mem[2,2,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm12, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[2,1,1,0,4,5,6,7]
-; SSE-NEXT: pand %xmm12, %xmm10
-; SSE-NEXT: por %xmm7, %xmm10
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[2,1,1,0,4,5,6,7]
+; SSE-NEXT: pand %xmm12, %xmm11
+; SSE-NEXT: por %xmm0, %xmm11
; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[2,2,3,3]
-; SSE-NEXT: movdqa %xmm12, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,3,3]
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,3,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7]
; SSE-NEXT: pand %xmm12, %xmm0
-; SSE-NEXT: por %xmm7, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,3,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3]
; SSE-NEXT: movdqa %xmm12, %xmm7
-; SSE-NEXT: pandn %xmm11, %xmm7
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7]
-; SSE-NEXT: pand %xmm12, %xmm6
-; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; SSE-NEXT: pandn %xmm1, %xmm7
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
; SSE-NEXT: pand %xmm12, %xmm1
+; SSE-NEXT: por %xmm7, %xmm1
+; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,0,4,5,6,7]
+; SSE-NEXT: pand %xmm12, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm12
-; SSE-NEXT: por %xmm1, %xmm12
+; SSE-NEXT: por %xmm4, %xmm12
; SSE-NEXT: movdqa %xmm12, 32(%rcx)
-; SSE-NEXT: movdqa %xmm6, 80(%rcx)
+; SSE-NEXT: movdqa %xmm1, 80(%rcx)
; SSE-NEXT: movdqa %xmm0, 128(%rcx)
-; SSE-NEXT: movdqa %xmm10, 176(%rcx)
+; SSE-NEXT: movdqa %xmm11, 176(%rcx)
; SSE-NEXT: movdqa %xmm5, (%rcx)
; SSE-NEXT: movdqa %xmm2, 16(%rcx)
-; SSE-NEXT: movdqa %xmm4, 48(%rcx)
+; SSE-NEXT: movdqa %xmm15, 48(%rcx)
; SSE-NEXT: movdqa %xmm14, 64(%rcx)
-; SSE-NEXT: movdqa %xmm9, 96(%rcx)
-; SSE-NEXT: movdqa %xmm13, 112(%rcx)
-; SSE-NEXT: movdqa %xmm15, 144(%rcx)
+; SSE-NEXT: movdqa %xmm13, 96(%rcx)
+; SSE-NEXT: movdqa %xmm10, 112(%rcx)
+; SSE-NEXT: movdqa %xmm9, 144(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm12
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm15
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm9
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2]
-; AVX1-NEXT: vmovdqa (%rsi), %xmm13
-; AVX1-NEXT: vmovdqa 16(%rsi), %xmm8
-; AVX1-NEXT: vmovdqa 32(%rsi), %xmm6
-; AVX1-NEXT: vmovdqa 48(%rsi), %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
-; AVX1-NEXT: vmovdqa 16(%rdx), %xmm11
-; AVX1-NEXT: vmovdqa 32(%rdx), %xmm2
+; AVX1-NEXT: vmovdqa (%rdi), %xmm14
+; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2]
+; AVX1-NEXT: vmovdqa (%rsi), %xmm2
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10
+; AVX1-NEXT: vmovdqa 48(%rsi), %xmm12
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7]
+; AVX1-NEXT: vmovdqa 16(%rdx), %xmm8
+; AVX1-NEXT: vmovdqa 32(%rdx), %xmm13
; AVX1-NEXT: vmovdqa 48(%rdx), %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2],xmm7[3],xmm3[4,5],xmm7[6],xmm3[7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2],xmm7[3],xmm3[4,5],xmm7[6],xmm3[7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2],xmm7[3],xmm3[4,5],xmm7[6],xmm3[7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,1,2,2]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7]
-; AVX1-NEXT: vmovdqa (%rdx), %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3],xmm14[4],xmm5[5,6],xmm14[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
-; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
-; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7]
-; AVX1-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX1-NEXT: vmovdqa %xmm6, 32(%rcx)
-; AVX1-NEXT: vmovdqa %xmm4, 48(%rcx)
-; AVX1-NEXT: vmovdqa %xmm2, 80(%rcx)
-; AVX1-NEXT: vmovdqa %xmm1, 96(%rcx)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm11[2],xmm6[3,4],xmm11[5],xmm6[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[1,1,2,2]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7]
+; AVX1-NEXT: vmovdqa (%rdx), %xmm15
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5],xmm14[6],xmm11[7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm14
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm4[2],xmm14[3,4],xmm4[5],xmm14[6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6],xmm11[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
+; AVX1-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX1-NEXT: vmovdqa %xmm3, 48(%rcx)
+; AVX1-NEXT: vmovdqa %xmm9, 80(%rcx)
+; AVX1-NEXT: vmovdqa %xmm7, 96(%rcx)
; AVX1-NEXT: vmovdqa %xmm0, 176(%rcx)
-; AVX1-NEXT: vmovdqa %xmm5, 128(%rcx)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vmovaps %xmm0, 144(%rcx)
+; AVX1-NEXT: vmovdqa %xmm4, 128(%rcx)
+; AVX1-NEXT: vmovdqa %xmm14, 144(%rcx)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 16(%rcx)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -850,81 +853,81 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
;
; AVX2-SLOW-LABEL: vf32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9
-; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11
-; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1
+; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4
+; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2
; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm6
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <5,5,u,6,6,u,7,7>
-; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm13, %ymm6
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7>
+; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm5, %ymm6
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm10
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6
-; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4
-; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7
-; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
-; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3,4],xmm7[5],xmm1[6,7]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm13, %ymm1
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm12
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,2,2]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
-; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm4, %ymm7
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm3
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,3,3,u,4,4,u,5>
-; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm5, %ymm6
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,u,3,3,u,4,4,u>
-; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm6, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm3
-; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
-; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm6, %ymm4
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rcx)
+; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11
+; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12
+; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
+; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm7
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7
+; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm5, %ymm5
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[3,3,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,0,0,u,1,1,u,2>
+; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm7, %ymm9
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,3,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
+; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8
+; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm7
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
+; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,3,3,u,4,4,u,5>
+; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm9, %ymm10
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,3,3,u,4,4,u>
+; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm10, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm9, %ymm4
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1
+; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm10, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, 128(%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm12, 64(%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm10, 160(%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -940,68 +943,68 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <u,0,0,u,1,1,u,2>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm11, %ymm7
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm4, %ymm7
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rsi), %xmm1
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm5, %xmm5
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rsi), %xmm9
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm11, %xmm10, %xmm10
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm9, %xmm9
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7]
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm5
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm7
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7]
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm9
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm10
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rsi), %xmm6
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm11, %ymm10
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm10, %ymm11
-; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm5, %ymm8
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm1
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm4, %ymm4
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm9, %ymm4
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm11, %xmm7, %xmm7
; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %ymm6
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7]
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm5, %ymm3
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm6, %ymm4
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <u,3,3,u,4,4,u,5>
-; AVX2-FAST-ALL-NEXT: vpermd 32(%rdi), %ymm5, %ymm6
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm6
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm6, %ymm3
-; AVX2-FAST-ALL-NEXT: vpermd (%rdi), %ymm5, %ymm5
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm3
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm5, %ymm6
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm5, %ymm0
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm9, %ymm5
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = <u,3,3,u,4,4,u,5>
+; AVX2-FAST-ALL-NEXT: vpermd 32(%rdi), %ymm7, %ymm8
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm8
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm8, %ymm5
+; AVX2-FAST-ALL-NEXT: vpermd (%rdi), %ymm7, %ymm7
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm7, %ymm8
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm7, %ymm0
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, 128(%rcx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, 128(%rcx)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 64(%rcx)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm11, 96(%rcx)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 160(%rcx)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 96(%rcx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, 160(%rcx)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
@@ -1017,68 +1020,68 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.ve
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <u,0,0,u,1,1,u,2>
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm11, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm7
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7>
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm11, %ymm10
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm10, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm5, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7>
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm4, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm9, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,3,3,u,4,4,u,5>
-; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm5, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm5, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u>
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm5, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,3,3,u,4,4,u,5>
+; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm7, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm7, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u>
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm7, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 128(%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 96(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index c7740d4376c3e..33ba42a8238b9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -276,148 +276,148 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm5
-; SSE-NEXT: movdqa 16(%rsi), %xmm8
-; SSE-NEXT: movdqa (%rdx), %xmm3
+; SSE-NEXT: movdqa 16(%rsi), %xmm6
+; SSE-NEXT: movdqa (%rdx), %xmm7
; SSE-NEXT: movdqa 16(%rdx), %xmm4
-; SSE-NEXT: movdqa (%rcx), %xmm6
+; SSE-NEXT: movdqa (%rcx), %xmm8
; SSE-NEXT: movdqa 16(%rcx), %xmm9
-; SSE-NEXT: movdqa %xmm3, %xmm7
-; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; SSE-NEXT: movdqa %xmm7, %xmm10
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE-NEXT: movdqa %xmm2, %xmm10
-; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
-; SSE-NEXT: movdqa %xmm6, %xmm7
-; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE-NEXT: movdqa %xmm4, %xmm7
+; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: movdqa %xmm1, 96(%r8)
-; SSE-NEXT: movdqa %xmm3, 112(%r8)
-; SSE-NEXT: movdqa %xmm6, 64(%r8)
-; SSE-NEXT: movdqa %xmm7, 80(%r8)
+; SSE-NEXT: movdqa %xmm6, 112(%r8)
+; SSE-NEXT: movdqa %xmm8, 64(%r8)
+; SSE-NEXT: movdqa %xmm10, 80(%r8)
; SSE-NEXT: movdqa %xmm0, 32(%r8)
; SSE-NEXT: movdqa %xmm5, 48(%r8)
; SSE-NEXT: movdqa %xmm2, (%r8)
-; SSE-NEXT: movdqa %xmm10, 16(%r8)
+; SSE-NEXT: movdqa %xmm3, 16(%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rcx), %xmm8
+; AVX1-NEXT: vmovdqa (%rcx), %xmm0
; AVX1-NEXT: vmovdqa 16(%rcx), %xmm5
-; AVX1-NEXT: vmovdqa (%rdx), %xmm9
+; AVX1-NEXT: vmovdqa (%rdx), %xmm1
; AVX1-NEXT: vmovdqa 16(%rdx), %xmm6
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3
; AVX1-NEXT: vmovdqa (%rsi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7
; AVX1-NEXT: vmovdqa (%rdi), %xmm4
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4],ymm3[5],ymm9[6],ymm3[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
-; AVX1-NEXT: vmovaps %ymm2, (%r8)
-; AVX1-NEXT: vmovaps %ymm0, 96(%r8)
-; AVX1-NEXT: vmovaps %ymm1, 64(%r8)
-; AVX1-NEXT: vmovaps %ymm11, 32(%r8)
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT: vmovaps %ymm0, (%r8)
+; AVX1-NEXT: vmovaps %ymm5, 96(%r8)
+; AVX1-NEXT: vmovaps %ymm9, 64(%r8)
+; AVX1-NEXT: vmovaps %ymm3, 32(%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vf16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rcx), %xmm8
+; AVX2-NEXT: vmovdqa (%rcx), %xmm0
; AVX2-NEXT: vmovdqa 16(%rcx), %xmm5
-; AVX2-NEXT: vmovdqa (%rdx), %xmm9
+; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm3
; AVX2-NEXT: vmovdqa (%rsi), %xmm2
; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7
; AVX2-NEXT: vmovdqa (%rdi), %xmm4
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm0
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4],ymm3[5],ymm9[6],ymm3[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
-; AVX2-NEXT: vmovdqa %ymm2, (%r8)
-; AVX2-NEXT: vmovdqa %ymm0, 96(%r8)
-; AVX2-NEXT: vmovdqa %ymm1, 64(%r8)
-; AVX2-NEXT: vmovdqa %ymm11, 32(%r8)
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT: vmovdqa %ymm0, (%r8)
+; AVX2-NEXT: vmovdqa %ymm5, 96(%r8)
+; AVX2-NEXT: vmovdqa %ymm9, 64(%r8)
+; AVX2-NEXT: vmovdqa %ymm3, 32(%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -453,297 +453,293 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm10
-; SSE-NEXT: movdqa 16(%rdi), %xmm13
-; SSE-NEXT: movdqa 32(%rdi), %xmm8
-; SSE-NEXT: movdqa 48(%rdi), %xmm4
-; SSE-NEXT: movdqa (%rsi), %xmm3
-; SSE-NEXT: movdqa 16(%rsi), %xmm1
+; SSE-NEXT: movdqa (%rdi), %xmm5
+; SSE-NEXT: movdqa 16(%rdi), %xmm11
+; SSE-NEXT: movdqa 32(%rdi), %xmm4
+; SSE-NEXT: movdqa 48(%rdi), %xmm2
+; SSE-NEXT: movdqa (%rsi), %xmm0
+; SSE-NEXT: movdqa 16(%rsi), %xmm3
; SSE-NEXT: movdqa 32(%rsi), %xmm9
-; SSE-NEXT: movdqa (%rdx), %xmm0
-; SSE-NEXT: movdqa 16(%rdx), %xmm5
-; SSE-NEXT: movdqa 32(%rdx), %xmm6
-; SSE-NEXT: movdqa (%rcx), %xmm11
+; SSE-NEXT: movdqa (%rdx), %xmm7
+; SSE-NEXT: movdqa 16(%rdx), %xmm13
+; SSE-NEXT: movdqa 32(%rdx), %xmm10
+; SSE-NEXT: movdqa (%rcx), %xmm8
; SSE-NEXT: movdqa 16(%rcx), %xmm14
; SSE-NEXT: movdqa 32(%rcx), %xmm12
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; SSE-NEXT: movdqa %xmm10, %xmm15
-; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3]
-; SSE-NEXT: movdqa %xmm15, %xmm2
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
-; SSE-NEXT: movdqa %xmm10, %xmm2
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm7, %xmm15
+; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE-NEXT: movdqa %xmm13, %xmm7
-; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm7, %xmm2
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm13, %xmm11
-; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
-; SSE-NEXT: movdqa %xmm8, %xmm5
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
-; SSE-NEXT: movdqa %xmm5, %xmm14
-; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE-NEXT: movdqa 48(%rdx), %xmm0
-; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; SSE-NEXT: movdqa %xmm13, %xmm15
+; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; SSE-NEXT: movdqa %xmm11, %xmm7
+; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; SSE-NEXT: movdqa %xmm11, %xmm8
+; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
+; SSE-NEXT: movdqa %xmm10, %xmm15
+; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; SSE-NEXT: movdqa %xmm4, %xmm13
+; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
+; SSE-NEXT: movdqa %xmm13, %xmm14
+; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
+; SSE-NEXT: movdqa 48(%rdx), %xmm15
+; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
; SSE-NEXT: movdqa 48(%rcx), %xmm12
-; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
-; SSE-NEXT: movdqa %xmm8, %xmm9
-; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
-; SSE-NEXT: movdqa 48(%rsi), %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm4, 224(%r8)
-; SSE-NEXT: movdqa %xmm2, 240(%r8)
+; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
+; SSE-NEXT: movdqa %xmm15, %xmm10
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
+; SSE-NEXT: movdqa 48(%rsi), %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1]
+; SSE-NEXT: movdqa %xmm2, 224(%r8)
+; SSE-NEXT: movdqa %xmm1, 240(%r8)
; SSE-NEXT: movdqa %xmm3, 192(%r8)
-; SSE-NEXT: movdqa %xmm1, 208(%r8)
-; SSE-NEXT: movdqa %xmm8, 160(%r8)
+; SSE-NEXT: movdqa %xmm0, 208(%r8)
+; SSE-NEXT: movdqa %xmm4, 160(%r8)
; SSE-NEXT: movdqa %xmm9, 176(%r8)
-; SSE-NEXT: movdqa %xmm5, 128(%r8)
+; SSE-NEXT: movdqa %xmm13, 128(%r8)
; SSE-NEXT: movdqa %xmm14, 144(%r8)
-; SSE-NEXT: movdqa %xmm13, 96(%r8)
-; SSE-NEXT: movdqa %xmm11, 112(%r8)
+; SSE-NEXT: movdqa %xmm11, 96(%r8)
+; SSE-NEXT: movdqa %xmm8, 112(%r8)
; SSE-NEXT: movdqa %xmm7, 64(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%r8)
-; SSE-NEXT: movdqa %xmm10, 32(%r8)
+; SSE-NEXT: movdqa %xmm5, 32(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%r8)
-; SSE-NEXT: movdqa %xmm15, (%r8)
+; SSE-NEXT: movdqa %xmm6, (%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rcx), %xmm12
-; AVX1-NEXT: vmovdqa 16(%rcx), %xmm15
-; AVX1-NEXT: vmovdqa 32(%rcx), %xmm3
-; AVX1-NEXT: vmovdqa 48(%rcx), %xmm11
-; AVX1-NEXT: vmovdqa (%rdx), %xmm13
+; AVX1-NEXT: vmovdqa (%rcx), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rcx), %xmm4
+; AVX1-NEXT: vmovdqa 32(%rcx), %xmm9
+; AVX1-NEXT: vmovdqa 48(%rcx), %xmm8
+; AVX1-NEXT: vmovdqa (%rdx), %xmm1
; AVX1-NEXT: vmovdqa 16(%rdx), %xmm6
-; AVX1-NEXT: vmovdqa 32(%rdx), %xmm7
-; AVX1-NEXT: vmovdqa 48(%rdx), %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
+; AVX1-NEXT: vmovdqa 32(%rdx), %xmm10
+; AVX1-NEXT: vmovdqa 48(%rdx), %xmm11
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm8
-; AVX1-NEXT: vmovdqa (%rsi), %xmm14
-; AVX1-NEXT: vmovdqa 48(%rsi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3
+; AVX1-NEXT: vmovdqa (%rsi), %xmm2
+; AVX1-NEXT: vmovdqa 48(%rsi), %xmm12
; AVX1-NEXT: vmovdqa (%rdi), %xmm5
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm8
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm13
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm14, %ymm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm14, %ymm7
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
+; AVX1-NEXT: vmovdqa 32(%rsi), %xmm15
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2],ymm7[3],ymm14[4],ymm7[5],ymm14[6],ymm7[7]
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm13
+; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm12
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm10[0],zero,xmm10[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3],ymm14[4],ymm10[5],ymm14[6],ymm10[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0
-; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
-; AVX1-NEXT: vmovaps %ymm2, (%r8)
-; AVX1-NEXT: vmovaps %ymm1, 96(%r8)
-; AVX1-NEXT: vmovaps %ymm0, 64(%r8)
-; AVX1-NEXT: vmovaps %ymm3, 160(%r8)
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT: vmovaps %ymm0, (%r8)
+; AVX1-NEXT: vmovaps %ymm4, 96(%r8)
+; AVX1-NEXT: vmovaps %ymm10, 64(%r8)
+; AVX1-NEXT: vmovaps %ymm9, 160(%r8)
; AVX1-NEXT: vmovaps %ymm11, 128(%r8)
; AVX1-NEXT: vmovaps %ymm8, 224(%r8)
-; AVX1-NEXT: vmovaps %ymm9, 192(%r8)
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX1-NEXT: vmovaps %ymm7, 192(%r8)
+; AVX1-NEXT: vmovaps %ymm3, 32(%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rcx), %xmm12
-; AVX2-NEXT: vmovdqa 16(%rcx), %xmm15
-; AVX2-NEXT: vmovdqa 32(%rcx), %xmm3
-; AVX2-NEXT: vmovdqa 48(%rcx), %xmm11
-; AVX2-NEXT: vmovdqa (%rdx), %xmm13
+; AVX2-NEXT: vmovdqa (%rcx), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rcx), %xmm4
+; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9
+; AVX2-NEXT: vmovdqa 48(%rcx), %xmm8
+; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6
-; AVX2-NEXT: vmovdqa 32(%rdx), %xmm7
-; AVX2-NEXT: vmovdqa 48(%rdx), %xmm1
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
+; AVX2-NEXT: vmovdqa 32(%rdx), %xmm10
+; AVX2-NEXT: vmovdqa 48(%rdx), %xmm11
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm8
-; AVX2-NEXT: vmovdqa (%rsi), %xmm14
-; AVX2-NEXT: vmovdqa 48(%rsi), %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm3
+; AVX2-NEXT: vmovdqa (%rsi), %xmm2
+; AVX2-NEXT: vmovdqa 48(%rsi), %xmm12
; AVX2-NEXT: vmovdqa (%rdi), %xmm5
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm13
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm14, %ymm7
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,0,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm14, %ymm7
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14
+; AVX2-NEXT: vmovdqa 32(%rsi), %xmm15
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2],ymm7[3],ymm14[4],ymm7[5],ymm14[6],ymm7[7]
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12
+; AVX2-NEXT: vmovdqa 16(%rsi), %xmm13
+; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm12
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm10[0],zero,xmm10[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm14, %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,0,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm14, %ymm10
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14
+; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3],ymm14[4],ymm10[5],ymm14[6],ymm10[7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm12, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0
-; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX2-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
-; AVX2-NEXT: vmovdqa %ymm2, (%r8)
-; AVX2-NEXT: vmovdqa %ymm1, 96(%r8)
-; AVX2-NEXT: vmovdqa %ymm0, 64(%r8)
-; AVX2-NEXT: vmovdqa %ymm3, 160(%r8)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT: vmovdqa %ymm0, (%r8)
+; AVX2-NEXT: vmovdqa %ymm4, 96(%r8)
+; AVX2-NEXT: vmovdqa %ymm10, 64(%r8)
+; AVX2-NEXT: vmovdqa %ymm9, 160(%r8)
; AVX2-NEXT: vmovdqa %ymm11, 128(%r8)
; AVX2-NEXT: vmovdqa %ymm8, 224(%r8)
-; AVX2-NEXT: vmovdqa %ymm9, 192(%r8)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX2-NEXT: vmovdqa %ymm7, 192(%r8)
+; AVX2-NEXT: vmovdqa %ymm3, 32(%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 045bc8805337b..fb58a5c62bbb8 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -294,91 +294,91 @@ define void @vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecp
define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm13
-; SSE-NEXT: movdqa (%rsi), %xmm11
-; SSE-NEXT: movdqa (%rdx), %xmm10
-; SSE-NEXT: movdqa (%rcx), %xmm12
-; SSE-NEXT: movdqa (%r8), %xmm8
+; SSE-NEXT: movdqa (%rdi), %xmm4
+; SSE-NEXT: movdqa (%rsi), %xmm7
+; SSE-NEXT: movdqa (%rdx), %xmm2
+; SSE-NEXT: movdqa (%rcx), %xmm3
+; SSE-NEXT: movdqa (%r8), %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm13, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: por %xmm1, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,0,65535,65535]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,2,2]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,0]
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm7, %xmm1
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm4, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
+; SSE-NEXT: pand %xmm0, %xmm5
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,0]
+; SSE-NEXT: pand %xmm1, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm1, %xmm11
+; SSE-NEXT: pandn %xmm10, %xmm11
+; SSE-NEXT: por %xmm9, %xmm11
+; SSE-NEXT: pand %xmm8, %xmm11
+; SSE-NEXT: pandn %xmm5, %xmm8
+; SSE-NEXT: por %xmm11, %xmm8
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm5, %xmm2
-; SSE-NEXT: pandn %xmm8, %xmm5
-; SSE-NEXT: por %xmm2, %xmm5
-; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0]
-; SSE-NEXT: movdqa %xmm13, %xmm4
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,3,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm7
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm7, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
-; SSE-NEXT: pandn %xmm9, %xmm0
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm13, %xmm1
-; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
-; SSE-NEXT: psrlq $48, %xmm11
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm11[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm2, %xmm7
-; SSE-NEXT: pandn %xmm1, %xmm7
-; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,7,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: pand %xmm3, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3]
-; SSE-NEXT: pandn %xmm7, %xmm3
-; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[0,2,3,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2]
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,65535,65535]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: pandn %xmm7, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm9, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm2, 16(%r9)
-; SSE-NEXT: movdqa %xmm1, 48(%r9)
-; SSE-NEXT: movdqa %xmm3, 64(%r9)
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm6, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1]
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0]
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1]
+; SSE-NEXT: pand %xmm10, %xmm11
+; SSE-NEXT: pandn %xmm8, %xmm10
+; SSE-NEXT: por %xmm11, %xmm10
+; SSE-NEXT: pand %xmm0, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
+; SSE-NEXT: pandn %xmm8, %xmm0
+; SSE-NEXT: por %xmm10, %xmm0
+; SSE-NEXT: movdqa %xmm4, %xmm10
+; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; SSE-NEXT: psrlq $48, %xmm7
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm7, %xmm11
+; SSE-NEXT: pandn %xmm10, %xmm11
+; SSE-NEXT: movdqa %xmm2, %xmm10
+; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,3,3]
+; SSE-NEXT: pand %xmm7, %xmm12
+; SSE-NEXT: por %xmm11, %xmm12
+; SSE-NEXT: pand %xmm1, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE-NEXT: pandn %xmm6, %xmm1
+; SSE-NEXT: por %xmm12, %xmm1
+; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; SSE-NEXT: pand %xmm7, %xmm4
+; SSE-NEXT: pandn %xmm10, %xmm7
+; SSE-NEXT: por %xmm4, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535]
+; SSE-NEXT: pand %xmm4, %xmm7
+; SSE-NEXT: pandn %xmm6, %xmm4
+; SSE-NEXT: por %xmm7, %xmm4
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
+; SSE-NEXT: pand %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm8, %xmm3
+; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm3, 16(%r9)
+; SSE-NEXT: movdqa %xmm4, 48(%r9)
+; SSE-NEXT: movdqa %xmm1, 64(%r9)
; SSE-NEXT: movdqa %xmm0, (%r9)
; SSE-NEXT: movdqa %xmm5, 32(%r9)
; SSE-NEXT: retq
@@ -392,36 +392,36 @@ define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecp
; AVX1-NEXT: vmovdqa (%r8), %xmm1
; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm2
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,7,6]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6],xmm7[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3],xmm6[4,5],xmm2[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm0[4],xmm6[5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3],xmm9[4,5,6],xmm7[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3],xmm10[4,5],xmm8[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm0[4],xmm9[5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3],xmm6[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1,2,3],xmm9[4,5],xmm4[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm1[3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
@@ -430,9 +430,9 @@ define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecp
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vmovdqa %xmm0, 48(%r9)
; AVX1-NEXT: vmovdqa %xmm4, 32(%r9)
-; AVX1-NEXT: vmovdqa %xmm2, 16(%r9)
-; AVX1-NEXT: vmovdqa %xmm10, (%r9)
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm1[2],xmm8[3,4,5,6],xmm1[7]
+; AVX1-NEXT: vmovdqa %xmm8, 16(%r9)
+; AVX1-NEXT: vmovdqa %xmm7, (%r9)
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7]
; AVX1-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX1-NEXT: retq
;
@@ -519,324 +519,323 @@ define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecp
define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: vf16:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: movdqa (%rdi), %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm5
-; SSE-NEXT: movdqa (%rsi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rsi), %xmm8
+; SSE-NEXT: movdqa (%rsi), %xmm15
+; SSE-NEXT: movdqa 16(%rsi), %xmm13
; SSE-NEXT: movdqa 16(%rdx), %xmm10
-; SSE-NEXT: movdqa (%rcx), %xmm12
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rcx), %xmm13
-; SSE-NEXT: movdqa 16(%r8), %xmm15
-; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm11, %xmm1
-; SSE-NEXT: pandn %xmm5, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa (%rcx), %xmm14
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rcx), %xmm11
+; SSE-NEXT: movdqa 16(%r8), %xmm8
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pandn %xmm5, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pand %xmm11, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535]
-; SSE-NEXT: movdqa %xmm1, %xmm7
-; SSE-NEXT: pandn %xmm2, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,2,2]
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,65535,65535,0]
-; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm14, %xmm4
-; SSE-NEXT: pandn %xmm9, %xmm4
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: por %xmm7, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm15, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pand %xmm11, %xmm4
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm14, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: movdqa (%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
-; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: pand %xmm1, %xmm7
-; SSE-NEXT: pandn %xmm4, %xmm1
-; SSE-NEXT: movdqa (%r8), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,0,65535,65535]
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2]
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0]
+; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm9, %xmm12
+; SSE-NEXT: pandn %xmm7, %xmm12
+; SSE-NEXT: por %xmm6, %xmm12
+; SSE-NEXT: pand %xmm0, %xmm12
+; SSE-NEXT: por %xmm4, %xmm12
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm3, %xmm12
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm8, %xmm2
+; SSE-NEXT: por %xmm12, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm5, %xmm12
-; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; SSE-NEXT: psrlq $48, %xmm8
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm0, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: pandn %xmm1, %xmm7
-; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,7,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: pand %xmm14, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3]
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm4, %xmm2
-; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm15[3,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm15, %xmm4
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
+; SSE-NEXT: pand %xmm2, %xmm12
+; SSE-NEXT: por %xmm7, %xmm12
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm9, %xmm14
+; SSE-NEXT: pandn %xmm7, %xmm14
+; SSE-NEXT: movdqa (%rdx), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: pandn %xmm1, %xmm7
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,3,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm6
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; SSE-NEXT: movdqa %xmm13, %xmm6
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2]
+; SSE-NEXT: pand %xmm9, %xmm15
+; SSE-NEXT: por %xmm15, %xmm14
+; SSE-NEXT: pand %xmm0, %xmm14
+; SSE-NEXT: pandn %xmm12, %xmm0
+; SSE-NEXT: movdqa (%r8), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: por %xmm14, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm12
+; SSE-NEXT: movdqa %xmm5, %xmm14
+; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; SSE-NEXT: psrlq $48, %xmm13
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm13[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm13
+; SSE-NEXT: pandn %xmm12, %xmm13
+; SSE-NEXT: movdqa %xmm10, %xmm12
+; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,7,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,3,3]
+; SSE-NEXT: pand %xmm2, %xmm15
+; SSE-NEXT: por %xmm13, %xmm15
+; SSE-NEXT: pand %xmm9, %xmm15
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3]
+; SSE-NEXT: movdqa %xmm9, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm15, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT: movdqa %xmm2, %xmm15
+; SSE-NEXT: pandn %xmm12, %xmm15
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm14[0,2,3,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,2]
+; SSE-NEXT: pand %xmm2, %xmm12
+; SSE-NEXT: por %xmm15, %xmm12
+; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm15, %xmm14
+; SSE-NEXT: pandn %xmm1, %xmm14
+; SSE-NEXT: pand %xmm15, %xmm12
+; SSE-NEXT: por %xmm12, %xmm14
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,6,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT: movdqa %xmm11, %xmm12
+; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,0,65535]
-; SSE-NEXT: pand %xmm12, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,1]
-; SSE-NEXT: movdqa %xmm12, %xmm15
-; SSE-NEXT: pandn %xmm1, %xmm15
-; SSE-NEXT: por %xmm6, %xmm15
-; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,1]
-; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,0,65535,65535,65535,0]
-; SSE-NEXT: movdqa %xmm13, %xmm6
-; SSE-NEXT: pandn %xmm10, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1]
-; SSE-NEXT: pand %xmm13, %xmm5
-; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: movdqa %xmm11, %xmm10
-; SSE-NEXT: pandn %xmm1, %xmm10
-; SSE-NEXT: pand %xmm11, %xmm5
-; SSE-NEXT: por %xmm5, %xmm10
-; SSE-NEXT: movdqa %xmm3, %xmm8
-; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1]
+; SSE-NEXT: movdqa %xmm12, %xmm13
+; SSE-NEXT: pandn %xmm1, %xmm13
+; SSE-NEXT: por %xmm0, %xmm13
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0]
+; SSE-NEXT: movdqa %xmm10, %xmm11
+; SSE-NEXT: pandn %xmm0, %xmm11
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,1]
+; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: por %xmm11, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: pandn %xmm1, %xmm7
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm0, %xmm7
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm4, %xmm11
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; SSE-NEXT: psrlq $48, %xmm11
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm11[1]
+; SSE-NEXT: movdqa %xmm2, %xmm11
+; SSE-NEXT: pandn %xmm1, %xmm11
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE-NEXT: psrlq $48, %xmm4
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm5, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,7,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pand %xmm14, %xmm3
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
-; SSE-NEXT: pandn %xmm4, %xmm14
-; SSE-NEXT: por %xmm3, %xmm14
-; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm5, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: pandn %xmm4, %xmm9
-; SSE-NEXT: por %xmm0, %xmm9
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6]
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,7,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3]
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: por %xmm11, %xmm8
+; SSE-NEXT: pand %xmm9, %xmm8
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3]
+; SSE-NEXT: pandn %xmm11, %xmm9
+; SSE-NEXT: por %xmm8, %xmm9
+; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: pand %xmm15, %xmm2
+; SSE-NEXT: pandn %xmm11, %xmm15
+; SSE-NEXT: por %xmm2, %xmm15
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pand %xmm12, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1]
; SSE-NEXT: pandn %xmm0, %xmm12
; SSE-NEXT: por %xmm1, %xmm12
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,2,2,4,5,6,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,3,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,1]
-; SSE-NEXT: pand %xmm13, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm13
-; SSE-NEXT: por %xmm3, %xmm13
-; SSE-NEXT: pand %xmm11, %xmm13
-; SSE-NEXT: pandn %xmm0, %xmm11
-; SSE-NEXT: por %xmm13, %xmm11
-; SSE-NEXT: movdqa %xmm11, (%r9)
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,3,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1]
+; SSE-NEXT: pand %xmm10, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm10
+; SSE-NEXT: por %xmm2, %xmm10
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: pand %xmm1, %xmm10
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: por %xmm10, %xmm1
+; SSE-NEXT: movdqa %xmm1, (%r9)
; SSE-NEXT: movdqa %xmm12, 16(%r9)
-; SSE-NEXT: movdqa %xmm9, 48(%r9)
-; SSE-NEXT: movdqa %xmm14, 64(%r9)
-; SSE-NEXT: movdqa %xmm10, 80(%r9)
-; SSE-NEXT: movdqa %xmm15, 96(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 128(%r9)
+; SSE-NEXT: movdqa %xmm15, 48(%r9)
+; SSE-NEXT: movdqa %xmm9, 64(%r9)
+; SSE-NEXT: movdqa %xmm7, 80(%r9)
+; SSE-NEXT: movdqa %xmm13, 96(%r9)
+; SSE-NEXT: movdqa %xmm14, 128(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%r9)
-; SSE-NEXT: popq %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rcx), %xmm13
+; AVX1-NEXT: vmovdqa (%rcx), %xmm1
; AVX1-NEXT: vmovdqa 16(%rcx), %xmm7
-; AVX1-NEXT: vmovdqa (%rdx), %xmm9
-; AVX1-NEXT: vmovdqa 16(%rdx), %xmm3
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT: vmovdqa (%rdx), %xmm5
+; AVX1-NEXT: vmovdqa 16(%rdx), %xmm9
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535]
-; AVX1-NEXT: vandnps %ymm0, %ymm10, %ymm6
-; AVX1-NEXT: vmovdqa (%rdi), %xmm14
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0
-; AVX1-NEXT: vmovdqa (%rsi), %xmm15
-; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
-; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm2
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vandps %ymm2, %ymm10, %ymm2
-; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535]
+; AVX1-NEXT: vandnps %ymm0, %ymm6, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm10
+; AVX1-NEXT: vmovdqa (%rsi), %xmm4
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm12
+; AVX1-NEXT: vpsrlq $48, %xmm12, %xmm3
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3
+; AVX1-NEXT: vandps %ymm6, %ymm3, %ymm3
+; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
; AVX1-NEXT: vmovdqa 16(%r8), %xmm11
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm4[2],xmm8[3,4,5,6],xmm4[7]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1,2,3,4],xmm4[5],xmm2[6,7]
-; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm2
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm14[1],xmm2[1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,3,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,7,6]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vandnps %ymm2, %ymm10, %ymm2
-; AVX1-NEXT: vandps %ymm5, %ymm10, %ymm5
-; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,6,6]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,2]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6,7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
-; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7]
+; AVX1-NEXT: vpsrlq $48, %xmm4, %xmm8
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm2[1],xmm8[1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,3,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm14
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5,7,6]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,3,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vmovdqa (%r8), %xmm4
-; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
-; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,1,2,2]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm14[4],xmm6[5,6,7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
-; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5
-; AVX1-NEXT: vandps %ymm6, %ymm3, %ymm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4,5,6],xmm6[7]
-; AVX1-NEXT: vorps %ymm5, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4],xmm6[5],xmm5[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm3, 32(%r9)
-; AVX1-NEXT: vmovdqa %xmm5, 48(%r9)
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0
+; AVX1-NEXT: vandnps %ymm14, %ymm6, %ymm14
+; AVX1-NEXT: vandps %ymm6, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm0, %ymm14, %ymm6
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,6,6]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm12
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1],xmm13[2],xmm10[3,4,5,6],xmm13[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
+; AVX1-NEXT: vandnps %ymm12, %ymm15, %ymm7
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm0, %ymm15, %ymm0
+; AVX1-NEXT: vorps %ymm7, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
+; AVX1-NEXT: vmovdqa (%r8), %xmm12
+; AVX1-NEXT: vandnps %ymm0, %ymm15, %ymm0
+; AVX1-NEXT: vandps %ymm15, %ymm11, %ymm11
+; AVX1-NEXT: vorps %ymm0, %ymm11, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5],xmm13[6],xmm11[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm4, %ymm1
+; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm4[2],xmm6[3,4,5,6],xmm4[7]
+; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3,4],xmm4[5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[3],xmm1[4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm1, 32(%r9)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%r9)
; AVX1-NEXT: vmovdqa %xmm0, (%r9)
-; AVX1-NEXT: vmovdqa %xmm1, 16(%r9)
-; AVX1-NEXT: vmovdqa %xmm7, 96(%r9)
-; AVX1-NEXT: vmovdqa %xmm11, 112(%r9)
-; AVX1-NEXT: vmovdqa %xmm2, 64(%r9)
+; AVX1-NEXT: vmovdqa %xmm11, 16(%r9)
+; AVX1-NEXT: vmovdqa %xmm9, 96(%r9)
+; AVX1-NEXT: vmovdqa %xmm7, 112(%r9)
+; AVX1-NEXT: vmovdqa %xmm5, 64(%r9)
; AVX1-NEXT: vmovdqa %xmm10, 80(%r9)
-; AVX1-NEXT: vmovdqa %xmm8, 128(%r9)
+; AVX1-NEXT: vmovdqa %xmm3, 128(%r9)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 144(%r9)
; AVX1-NEXT: vzeroupper
@@ -844,83 +843,83 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
;
; AVX2-SLOW-LABEL: vf16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm8
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,1,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm5, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm5
+; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm5, %ymm5
+; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm9
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm5, %ymm5
-; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2],xmm1[3],xmm6[4,5],xmm1[6],xmm6[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,2,3,3,7,6,7,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3,4],ymm7[5,6,7,8],ymm1[9],ymm7[10],ymm1[11,12],ymm7[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,2,5,5,5,6]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5],ymm1[6],ymm7[7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13],ymm1[14],ymm7[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,1,4,5,6,5]
+; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5
+; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm9
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
+; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,2,3,3,7,6,7,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm8
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3],ymm7[4],ymm10[5,6],ymm7[7],ymm10[8,9],ymm7[10],ymm10[11],ymm7[12],ymm10[13,14],ymm7[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm7
+; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
+; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1
+; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,1,2,2]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,2,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9)
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm7, 128(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9)
; AVX2-SLOW-NEXT: vzeroupper
@@ -928,79 +927,79 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
;
; AVX2-FAST-LABEL: vf16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9
-; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2
; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4
-; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8
+; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1
; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,2,2,2]
-; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[2,3,2,3,6,7,6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,2,3,3,7,6,7,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,2,5,5,5,6]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,1,2,1,4,5,6,5]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3],ymm2[4],ymm7[5,6],ymm2[7],ymm7[8,9],ymm2[10],ymm7[11],ymm2[12],ymm7[13,14],ymm2[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2]
+; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,1,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm7
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,2,3,3,7,6,7,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm8
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
+; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm9
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,1,2,2]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,2,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9)
-; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%r9)
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm7, 128(%r9)
; AVX2-FAST-NEXT: vmovdqa %ymm6, (%r9)
; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r9)
; AVX2-FAST-NEXT: vzeroupper
@@ -1051,408 +1050,402 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $248, %rsp
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rsi), %xmm13
-; SSE-NEXT: movdqa 16(%rsi), %xmm9
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdx), %xmm14
-; SSE-NEXT: movdqa (%rcx), %xmm11
+; SSE-NEXT: subq $232, %rsp
+; SSE-NEXT: movdqa (%rdi), %xmm6
+; SSE-NEXT: movdqa 16(%rdi), %xmm11
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rsi), %xmm8
+; SSE-NEXT: movdqa 16(%rsi), %xmm13
+; SSE-NEXT: movdqa (%rdx), %xmm2
+; SSE-NEXT: movdqa (%rcx), %xmm0
; SSE-NEXT: movdqa 16(%rcx), %xmm12
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%r8), %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa (%r8), %xmm15
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pand %xmm15, %xmm3
+; SSE-NEXT: pand %xmm9, %xmm3
; SSE-NEXT: por %xmm1, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,2,2]
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,65535,65535,0]
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm8, %xmm7
-; SSE-NEXT: pandn %xmm6, %xmm7
-; SSE-NEXT: por %xmm5, %xmm7
-; SSE-NEXT: pand %xmm1, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,2,2]
+; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,65535,65535,0]
+; SSE-NEXT: pand %xmm14, %xmm5
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm14, %xmm10
+; SSE-NEXT: pandn %xmm7, %xmm10
+; SSE-NEXT: por %xmm5, %xmm10
+; SSE-NEXT: pand %xmm1, %xmm10
+; SSE-NEXT: por %xmm4, %xmm10
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm5, %xmm10
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pandn %xmm15, %xmm3
+; SSE-NEXT: por %xmm10, %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm4
+; SSE-NEXT: pandn %xmm11, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; SSE-NEXT: pand %xmm9, %xmm7
; SSE-NEXT: por %xmm4, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm6, %xmm7
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: por %xmm7, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm15, %xmm4
-; SSE-NEXT: pandn %xmm10, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pand %xmm15, %xmm5
-; SSE-NEXT: por %xmm4, %xmm5
-; SSE-NEXT: movdqa %xmm1, %xmm7
-; SSE-NEXT: pandn %xmm5, %xmm7
+; SSE-NEXT: movdqa %xmm1, %xmm10
+; SSE-NEXT: pandn %xmm7, %xmm10
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm8, %xmm5
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: movdqa 16(%rdx), %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,2,2]
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: por %xmm0, %xmm5
-; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: movdqa 16(%r8), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm6, %xmm5
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: por %xmm5, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: movdqa 32(%rsi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pand %xmm15, %xmm5
-; SSE-NEXT: por %xmm0, %xmm5
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: pandn %xmm5, %xmm0
-; SSE-NEXT: movdqa 32(%rcx), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm8, %xmm7
-; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: movdqa 32(%rdx), %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,2,2]
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: por %xmm5, %xmm7
+; SSE-NEXT: movdqa %xmm14, %xmm7
+; SSE-NEXT: pandn %xmm4, %xmm7
+; SSE-NEXT: movdqa 16(%rdx), %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,2,2]
+; SSE-NEXT: pand %xmm14, %xmm11
+; SSE-NEXT: por %xmm11, %xmm7
; SSE-NEXT: pand %xmm1, %xmm7
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: pand %xmm6, %xmm7
-; SSE-NEXT: movdqa 32(%r8), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: por %xmm7, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: por %xmm10, %xmm7
+; SSE-NEXT: movdqa 16(%r8), %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm5, %xmm7
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pandn %xmm10, %xmm3
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 32(%rdi), %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm7
+; SSE-NEXT: pandn %xmm3, %xmm7
+; SSE-NEXT: movdqa 32(%rsi), %xmm3
+; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
+; SSE-NEXT: pand %xmm9, %xmm10
+; SSE-NEXT: por %xmm7, %xmm10
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: pandn %xmm10, %xmm7
+; SSE-NEXT: movdqa 32(%rcx), %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm14, %xmm11
+; SSE-NEXT: pandn %xmm10, %xmm11
+; SSE-NEXT: movdqa 32(%rdx), %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,2,2]
+; SSE-NEXT: pand %xmm14, %xmm12
+; SSE-NEXT: por %xmm12, %xmm11
+; SSE-NEXT: pand %xmm1, %xmm11
+; SSE-NEXT: por %xmm7, %xmm11
+; SSE-NEXT: pand %xmm5, %xmm11
+; SSE-NEXT: movdqa 32(%r8), %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pandn %xmm7, %xmm3
+; SSE-NEXT: por %xmm11, %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm7
+; SSE-NEXT: pandn %xmm3, %xmm7
+; SSE-NEXT: movdqa 48(%rsi), %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
+; SSE-NEXT: pand %xmm9, %xmm11
+; SSE-NEXT: por %xmm7, %xmm11
+; SSE-NEXT: movdqa 48(%rcx), %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm14, %xmm12
+; SSE-NEXT: pandn %xmm7, %xmm12
+; SSE-NEXT: movdqa 48(%rdx), %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,2]
+; SSE-NEXT: pand %xmm14, %xmm7
+; SSE-NEXT: por %xmm7, %xmm12
+; SSE-NEXT: pand %xmm1, %xmm12
+; SSE-NEXT: pandn %xmm11, %xmm1
+; SSE-NEXT: por %xmm12, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: movdqa 48(%r8), %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,1]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,0,65535,65535,65535,0]
+; SSE-NEXT: movdqa %xmm3, %xmm11
+; SSE-NEXT: pandn %xmm7, %xmm11
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,3,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1]
+; SSE-NEXT: pand %xmm3, %xmm12
+; SSE-NEXT: por %xmm11, %xmm12
+; SSE-NEXT: pand %xmm9, %xmm12
+; SSE-NEXT: movdqa %xmm15, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[0,1,0,1]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pandn %xmm11, %xmm1
+; SSE-NEXT: por %xmm12, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; SSE-NEXT: movdqa %xmm0, %xmm12
+; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535]
+; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: pandn %xmm11, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm12
+; SSE-NEXT: por %xmm12, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm1, %xmm11
+; SSE-NEXT: pandn %xmm7, %xmm11
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,2]
+; SSE-NEXT: pand %xmm1, %xmm12
+; SSE-NEXT: por %xmm11, %xmm12
+; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535]
+; SSE-NEXT: pand %xmm15, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3]
; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: movdqa 48(%rsi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pand %xmm15, %xmm5
-; SSE-NEXT: por %xmm0, %xmm5
-; SSE-NEXT: movdqa 48(%rcx), %xmm0
+; SSE-NEXT: pandn %xmm11, %xmm0
+; SSE-NEXT: por %xmm12, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm8, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: movdqa 48(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: pand %xmm1, %xmm7
-; SSE-NEXT: pandn %xmm5, %xmm1
-; SSE-NEXT: por %xmm7, %xmm1
-; SSE-NEXT: pand %xmm6, %xmm1
-; SSE-NEXT: movdqa 48(%r8), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: por %xmm1, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: psrlq $48, %xmm8
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm8[1]
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pandn %xmm6, %xmm8
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: por %xmm8, %xmm2
; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,0,65535,65535,65535,0]
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pandn %xmm0, %xmm5
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,3,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1]
-; SSE-NEXT: pand %xmm1, %xmm7
-; SSE-NEXT: por %xmm5, %xmm7
-; SSE-NEXT: pand %xmm15, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
-; SSE-NEXT: movdqa %xmm15, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: por %xmm7, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE-NEXT: movdqa %xmm11, %xmm5
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535]
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm7
-; SSE-NEXT: por %xmm7, %xmm0
+; SSE-NEXT: pandn %xmm11, %xmm0
+; SSE-NEXT: pand %xmm14, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
-; SSE-NEXT: movdqa %xmm14, %xmm2
+; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,3,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,1]
+; SSE-NEXT: pand %xmm3, %xmm8
+; SSE-NEXT: por %xmm6, %xmm8
+; SSE-NEXT: pand %xmm9, %xmm8
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,1]
+; SSE-NEXT: movdqa %xmm9, %xmm7
+; SSE-NEXT: pandn %xmm6, %xmm7
+; SSE-NEXT: por %xmm8, %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: pandn %xmm6, %xmm7
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: movdqa %xmm2, %xmm12
+; SSE-NEXT: por %xmm8, %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: pandn %xmm2, %xmm7
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm13, %xmm5
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: por %xmm7, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,65535,65535,0,65535,65535]
-; SSE-NEXT: pand %xmm13, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
-; SSE-NEXT: movdqa %xmm13, %xmm3
-; SSE-NEXT: pandn %xmm11, %xmm3
-; SSE-NEXT: por %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: psrlq $48, %xmm2
-; SSE-NEXT: movdqa %xmm6, %xmm3
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,7,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
-; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: pandn %xmm11, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm15, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,2,3]
+; SSE-NEXT: movdqa %xmm15, %xmm5
+; SSE-NEXT: pandn %xmm6, %xmm5
+; SSE-NEXT: por %xmm2, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: psrlq $48, %xmm13
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm0, %xmm2
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm13
+; SSE-NEXT: por %xmm2, %xmm13
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
+; SSE-NEXT: pand %xmm14, %xmm13
+; SSE-NEXT: por %xmm13, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm10, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
-; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: movdqa %xmm3, %xmm6
; SSE-NEXT: pandn %xmm2, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm9, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,1]
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: por %xmm6, %xmm4
-; SSE-NEXT: pand %xmm15, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,1]
-; SSE-NEXT: movdqa %xmm15, %xmm3
-; SSE-NEXT: pandn %xmm6, %xmm3
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535]
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm6, %xmm2
-; SSE-NEXT: pand %xmm14, %xmm4
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,1,3,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,1]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm9, %xmm13
+; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,1,0,1]
+; SSE-NEXT: pandn %xmm6, %xmm13
+; SSE-NEXT: por %xmm0, %xmm13
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm12, %xmm11
+; SSE-NEXT: pandn %xmm6, %xmm11
+; SSE-NEXT: pand %xmm12, %xmm2
+; SSE-NEXT: por %xmm2, %xmm11
+; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; SSE-NEXT: movdqa %xmm10, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: movdqa %xmm9, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: pand %xmm13, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3]
-; SSE-NEXT: movdqa %xmm13, %xmm6
-; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm15, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE-NEXT: movdqa %xmm15, %xmm7
+; SSE-NEXT: pandn %xmm0, %xmm7
+; SSE-NEXT: por %xmm2, %xmm7
; SSE-NEXT: movdqa %xmm5, %xmm2
; SSE-NEXT: psrlq $48, %xmm2
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,7,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
-; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pandn %xmm4, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm9, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1]
-; SSE-NEXT: pand %xmm1, %xmm6
-; SSE-NEXT: por %xmm4, %xmm6
-; SSE-NEXT: pand %xmm15, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,1]
-; SSE-NEXT: movdqa %xmm15, %xmm7
-; SSE-NEXT: pandn %xmm4, %xmm7
-; SSE-NEXT: por %xmm6, %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm14, %xmm11
-; SSE-NEXT: pandn %xmm4, %xmm11
-; SSE-NEXT: pand %xmm14, %xmm6
-; SSE-NEXT: por %xmm6, %xmm11
-; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
-; SSE-NEXT: movdqa %xmm12, %xmm4
-; SSE-NEXT: movdqa %xmm12, %xmm5
-; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: movdqa %xmm9, %xmm4
-; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: por %xmm6, %xmm4
-; SSE-NEXT: pand %xmm13, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3]
-; SSE-NEXT: movdqa %xmm13, %xmm12
-; SSE-NEXT: pandn %xmm6, %xmm12
-; SSE-NEXT: por %xmm4, %xmm12
-; SSE-NEXT: psrlq $48, %xmm3
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm9, %xmm4
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,7,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm9
-; SSE-NEXT: pandn %xmm6, %xmm9
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: por %xmm2, %xmm9
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,3,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1]
-; SSE-NEXT: pand %xmm1, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: pand %xmm15, %xmm1
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm10
+; SSE-NEXT: por %xmm2, %xmm10
+; SSE-NEXT: movdqa %xmm14, %xmm9
+; SSE-NEXT: pandn %xmm0, %xmm9
+; SSE-NEXT: pand %xmm14, %xmm10
+; SSE-NEXT: por %xmm10, %xmm9
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm6, %xmm10
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
-; SSE-NEXT: pandn %xmm2, %xmm15
-; SSE-NEXT: por %xmm1, %xmm15
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,6,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE-NEXT: movdqa %xmm14, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,3,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1]
+; SSE-NEXT: pand %xmm3, %xmm12
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm12, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: pand %xmm12, %xmm3
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1]
+; SSE-NEXT: pandn %xmm0, %xmm12
+; SSE-NEXT: por %xmm3, %xmm12
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: movdqa %xmm2, %xmm12
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
-; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm3
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm14
-; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: pand %xmm13, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
-; SSE-NEXT: pandn %xmm2, %xmm13
-; SSE-NEXT: por %xmm1, %xmm13
-; SSE-NEXT: psrlq $48, %xmm7
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,7,6]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm10, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm8
-; SSE-NEXT: por %xmm0, %xmm8
-; SSE-NEXT: movdqa %xmm8, 304(%r9)
-; SSE-NEXT: movdqa %xmm13, 288(%r9)
-; SSE-NEXT: movdqa %xmm3, 256(%r9)
-; SSE-NEXT: movdqa %xmm15, 240(%r9)
+; SSE-NEXT: pand %xmm3, %xmm10
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm10, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
+; SSE-NEXT: pandn %xmm3, %xmm15
+; SSE-NEXT: por %xmm0, %xmm15
+; SSE-NEXT: psrlq $48, %xmm5
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,7,6]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pand %xmm14, %xmm1
+; SSE-NEXT: pandn %xmm3, %xmm14
+; SSE-NEXT: por %xmm1, %xmm14
+; SSE-NEXT: movdqa %xmm14, 304(%r9)
+; SSE-NEXT: movdqa %xmm15, 288(%r9)
+; SSE-NEXT: movdqa %xmm2, 256(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 240(%r9)
; SSE-NEXT: movdqa %xmm9, 224(%r9)
-; SSE-NEXT: movdqa %xmm12, 208(%r9)
+; SSE-NEXT: movdqa %xmm7, 208(%r9)
; SSE-NEXT: movdqa %xmm11, 176(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 160(%r9)
+; SSE-NEXT: movdqa %xmm13, 160(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%r9)
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%r9)
@@ -1470,118 +1463,118 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; SSE-NEXT: movaps %xmm0, 112(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%r9)
-; SSE-NEXT: addq $248, %rsp
+; SSE-NEXT: addq $232, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $72, %rsp
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa 32(%rsi), %xmm15
-; AVX1-NEXT: vmovdqa 48(%rsi), %xmm5
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm9
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm5
+; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10
+; AVX1-NEXT: vmovdqa 48(%rsi), %xmm6
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,6,6]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
-; AVX1-NEXT: vandnps %ymm0, %ymm12, %ymm1
-; AVX1-NEXT: vmovdqa 32(%rdx), %xmm9
+; AVX1-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
+; AVX1-NEXT: vandnps %ymm0, %ymm14, %ymm1
+; AVX1-NEXT: vmovdqa 32(%rdx), %xmm11
; AVX1-NEXT: vmovdqa 48(%rdx), %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
-; AVX1-NEXT: vmovdqa 32(%rcx), %xmm6
-; AVX1-NEXT: vmovdqa 48(%rcx), %xmm7
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vandps %ymm2, %ymm12, %ymm2
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm10
-; AVX1-NEXT: vmovdqa 48(%r8), %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,6]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4
-; AVX1-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535]
-; AVX1-NEXT: vandnps %ymm4, %ymm14, %ymm4
-; AVX1-NEXT: vpsrlq $48, %xmm5, %xmm2
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vmovdqa 32(%rcx), %xmm13
+; AVX1-NEXT: vmovdqa 48(%rcx), %xmm4
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3,4,5,6],xmm7[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
; AVX1-NEXT: vandps %ymm2, %ymm14, %ymm2
-; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7]
-; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4],xmm4[5,6,7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
+; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovdqa 48(%r8), %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535]
+; AVX1-NEXT: vandnps %ymm7, %ymm12, %ymm7
+; AVX1-NEXT: vpsrlq $48, %xmm6, %xmm8
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm5[1],xmm8[1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
-; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
-; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4
-; AVX1-NEXT: vorps %ymm2, %ymm4, %ymm5
-; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm2
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm11[1],xmm2[1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5
+; AVX1-NEXT: vandps %ymm5, %ymm12, %ymm5
+; AVX1-NEXT: vorps %ymm7, %ymm5, %ymm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1,2,3,4],xmm6[5],xmm5[6,7]
+; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7]
+; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,2]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
+; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
+; AVX1-NEXT: vandnps %ymm5, %ymm8, %ymm5
+; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7
+; AVX1-NEXT: vorps %ymm5, %ymm7, %ymm5
+; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm7
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; AVX1-NEXT: vmovdqa 32(%r8), %xmm4
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm1[1],xmm10[2,3,4,5],xmm1[6],xmm10[7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2
-; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0
-; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm2[3],xmm6[4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vandnps %ymm3, %ymm12, %ymm1
+; AVX1-NEXT: vandps %ymm0, %ymm12, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm3[2],xmm0[3,4,5,6],xmm3[7]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4],xmm3[5],xmm1[6,7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm3[2],xmm0[3,4,5,6],xmm3[7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa 16(%rdx), %xmm9
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
+; AVX1-NEXT: vmovdqa 16(%rdx), %xmm8
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,3,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,1]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
@@ -1590,8 +1583,8 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: vmovdqa 16(%rcx), %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7]
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vandnps %ymm0, %ymm12, %ymm0
-; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm1
+; AVX1-NEXT: vandnps %ymm0, %ymm14, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm14, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
@@ -1599,7 +1592,7 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
@@ -1612,112 +1605,110 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2
-; AVX1-NEXT: vandps %ymm4, %ymm14, %ymm4
+; AVX1-NEXT: vandnps %ymm2, %ymm12, %ymm2
+; AVX1-NEXT: vandps %ymm4, %ymm12, %ymm4
; AVX1-NEXT: vorps %ymm2, %ymm4, %ymm2
-; AVX1-NEXT: vmovdqa 16(%r8), %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3]
+; AVX1-NEXT: vmovdqa 16(%r8), %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm2[1,2,3,4],xmm4[5],xmm2[6,7]
; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7]
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa (%rdi), %xmm10
-; AVX1-NEXT: vmovdqa (%rsi), %xmm13
-; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm2
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm10[1],xmm2[1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,3,2,4,5,6,7]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm15
+; AVX1-NEXT: vmovdqa (%rsi), %xmm9
+; AVX1-NEXT: vpsrlq $48, %xmm9, %xmm2
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm15[1],xmm2[1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vmovdqa (%rdx), %xmm3
-; AVX1-NEXT: vmovdqa (%rcx), %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,7,6]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5
-; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2
-; AVX1-NEXT: vandps %ymm5, %ymm14, %ymm5
-; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm5
+; AVX1-NEXT: vmovdqa (%rdx), %xmm4
+; AVX1-NEXT: vmovdqa (%rcx), %xmm5
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,7,6]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11
+; AVX1-NEXT: vandnps %ymm2, %ymm12, %ymm2
+; AVX1-NEXT: vandps %ymm12, %ymm11, %ymm11
+; AVX1-NEXT: vorps %ymm2, %ymm11, %ymm12
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,6,6]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm14
+; AVX1-NEXT: vandnps %ymm0, %ymm14, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm14, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,3,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm6, %ymm14, %ymm6
-; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0
-; AVX1-NEXT: vorps %ymm6, %ymm0, %ymm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
+; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2
+; AVX1-NEXT: vandps %ymm6, %ymm14, %ymm6
+; AVX1-NEXT: vorps %ymm2, %ymm6, %ymm2
; AVX1-NEXT: vmovdqa (%r8), %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm7[4],xmm0[5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5,6,7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3],xmm7[4],xmm2[5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm15[4],xmm5[5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
-; AVX1-NEXT: vandnps %ymm3, %ymm7, %ymm3
-; AVX1-NEXT: vandps %ymm7, %ymm4, %ymm4
-; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4],xmm4[5],xmm3[6,7]
-; AVX1-NEXT: vmovdqa %xmm3, 48(%r9)
+; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4
+; AVX1-NEXT: vandps %ymm7, %ymm5, %ymm5
+; AVX1-NEXT: vorps %ymm4, %ymm5, %ymm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7]
+; AVX1-NEXT: vmovdqa %xmm4, 48(%r9)
; AVX1-NEXT: vmovdqa %xmm6, 32(%r9)
-; AVX1-NEXT: vmovdqa %xmm0, 16(%r9)
-; AVX1-NEXT: vmovdqa %xmm1, (%r9)
-; AVX1-NEXT: vmovdqa %xmm2, 112(%r9)
-; AVX1-NEXT: vmovdqa %xmm11, 96(%r9)
-; AVX1-NEXT: vmovdqa %xmm9, 80(%r9)
-; AVX1-NEXT: vmovdqa %xmm5, 64(%r9)
+; AVX1-NEXT: vmovdqa %xmm2, 16(%r9)
+; AVX1-NEXT: vmovdqa %xmm8, (%r9)
+; AVX1-NEXT: vmovdqa %xmm3, 112(%r9)
+; AVX1-NEXT: vmovdqa %xmm1, 96(%r9)
+; AVX1-NEXT: vmovdqa %xmm0, 80(%r9)
+; AVX1-NEXT: vmovdqa %xmm7, 64(%r9)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 144(%r9)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1748,333 +1739,335 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
;
; AVX2-SLOW-LABEL: vf32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: subq $40, %rsp
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8
+; AVX2-SLOW-NEXT: subq $72, %rsp
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13
-; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm10
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7
-; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm11
-; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6
-; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10
+; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12
+; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,2,1,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,5,6]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm1
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm4, %ymm4
+; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm13
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm1
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
-; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm0
-; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm6
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,2,2,2]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2],xmm1[3],xmm6[4,5],xmm1[6],xmm6[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm13, %ymm4
+; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm5
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm5, %ymm5
+; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm11
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm4
+; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12
+; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm13
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3],xmm10[4,5],xmm6[6],xmm10[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,1,1]
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm6, %ymm6
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,2,6,7,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm10
-; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm1
-; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm5
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7]
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6
-; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm15
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,2]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7]
+; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9
+; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7]
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11
+; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm8
+; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7]
; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[3,2,3,3,7,6,7,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3,4],ymm7[5,6,7,8],ymm5[9],ymm7[10],ymm5[11,12],ymm7[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,1]
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,2,6,7,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8],ymm5[9],ymm9[10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15]
-; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[3,2,3,3,7,6,7,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2],ymm11[3,4],ymm9[5,6,7,8],ymm11[9],ymm9[10],ymm11[11,12],ymm9[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2]
+; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,2]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm8
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,2,3,3,7,6,7,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2],ymm9[3,4],ymm14[5,6,7,8],ymm9[9],ymm14[10],ymm9[11,12],ymm14[13,14,15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2]
-; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm9, %ymm7
-; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm5, %ymm5
-; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm0
-; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
-; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm15, %ymm9
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,1,1,2,5,5,5,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,1,4,5,6,5]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3],ymm13[4],ymm11[5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11],ymm13[12],ymm11[13,14],ymm13[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm9
-; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[1,1,1,2,5,5,5,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5],ymm11[6],ymm0[7,8],ymm11[9],ymm0[10,11],ymm11[12],ymm0[13],ymm11[14],ymm0[15]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,2,1,4,5,6,5]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3],ymm8[4],ymm11[5,6],ymm8[7],ymm11[8,9],ymm8[10],ymm11[11],ymm8[12],ymm11[13,14],ymm8[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm8
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm9, %ymm8, %ymm8
-; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9
-; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u>
-; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm11
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,0,3,0,7,4,7,4]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm15, %ymm9, %ymm9
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,1,1]
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm4
+; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15]
+; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,2,3,3,7,6,7,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2],ymm4[3,4],ymm15[5,6,7,8],ymm4[9],ymm15[10],ymm4[11,12],ymm15[13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2]
+; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm4, %ymm1
+; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm4
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm4, %ymm9
+; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm4
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm12
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm4
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,1,2,5,5,5,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5],ymm15[6],ymm4[7,8],ymm15[9],ymm4[10,11],ymm15[12],ymm4[13],ymm15[14],ymm4[15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,2,1,4,5,6,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
+; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,2,5,5,5,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1
+; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm5
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
+; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm5
+; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u>
+; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm6
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,0,3,0,7,4,7,4]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10],ymm6[11],ymm11[12,13],ymm6[14],ymm11[15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u>
; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm6
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[3,0,3,0,7,4,7,4]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15]
-; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm9
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[1,1,2,2]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
-; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm9, %ymm3
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2
-; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2]
-; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9)
+; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm5
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,0,3,0,7,4,7,4]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15]
+; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm5
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0
+; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2
+; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2]
+; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r9)
-; AVX2-SLOW-NEXT: vmovdqa %ymm7, 128(%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm12, 128(%r9)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9)
-; AVX2-SLOW-NEXT: vmovdqa %ymm5, 288(%r9)
-; AVX2-SLOW-NEXT: vmovdqa %ymm8, 256(%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm4, 256(%r9)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9)
; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9)
-; AVX2-SLOW-NEXT: addq $40, %rsp
+; AVX2-SLOW-NEXT: addq $72, %rsp
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: subq $40, %rsp
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14
-; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12
-; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11
-; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8
-; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4
-; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
-; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm8
+; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm4
; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm6
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5
-; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
-; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1
-; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7
-; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,2,2,2]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm9, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,1,1,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm9, %ymm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
+; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11
+; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm10
+; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12
+; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm13
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm10, %ymm4
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm10, %ymm1
; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm1
-; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm3
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
-; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm3
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,2,2,2]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
-; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,1]
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm5
+; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm10
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5],xmm10[6],xmm5[7]
+; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,2,2,2]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm10[1],xmm6[2],xmm10[3],xmm6[4,5],xmm10[6],xmm6[7]
+; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm1
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
-; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm10, %ymm5
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm10, %ymm1
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
+; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm7
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm9, %ymm7
+; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm9
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm9, %ymm1
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8
; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm2
-; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
-; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm2
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
-; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,2,3,3,7,6,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3,4],ymm4[5,6,7,8],ymm6[9],ymm4[10],ymm6[11,12],ymm4[13,14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4
-; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15]
-; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm7
-; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm11
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm8
+; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm11
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm11, %ymm8
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm11
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm14
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[3,2,3,3,7,6,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3,4],ymm14[5,6,7,8],ymm15[9],ymm14[10],ymm15[11,12],ymm14[13,14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14
+; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11
+; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm12
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm6
+; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm12
; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm13
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[3,2,3,3,7,6,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2],ymm15[3,4],ymm5[5,6,7,8],ymm15[9],ymm5[10],ymm15[11,12],ymm5[13,14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[3,2,3,3,7,6,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3,4],ymm12[5,6,7,8],ymm5[9],ymm12[10],ymm5[11,12],ymm12[13,14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm5
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm5
-; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
-; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm6
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm5, %ymm12
+; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm5
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm14
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,1,2,5,5,5,6]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10,11],ymm15[12],ymm6[13],ymm15[14],ymm6[15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13],ymm15[14],ymm14[15]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
-; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,2,1,4,5,6,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3],ymm8[4],ymm0[5,6],ymm8[7],ymm0[8,9],ymm8[10],ymm0[11],ymm8[12],ymm0[13,14],ymm8[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[1,1,1,2,5,5,5,6]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15]
-; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm6
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,1,2,1,4,5,6,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3],ymm15[4],ymm6[5,6],ymm15[7],ymm6[8,9],ymm15[10],ymm6[11],ymm15[12],ymm6[13,14],ymm15[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm6, %ymm2
-; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm6
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0
-; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm6
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm6, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u>
-; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm8
+; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,1,4,5,6,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3],ymm7[4],ymm1[5,6],ymm7[7],ymm1[8,9],ymm7[10],ymm1[11],ymm7[12],ymm1[13,14],ymm7[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm14[2,3,2,3]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[1,1,1,2,5,5,5,6]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15]
+; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm7
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,1,4,5,6,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm5
+; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm7
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm15
+; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm7
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm5
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u>
+; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u>
-; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm10
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm12[1,1,2,2]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
-; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u>
+; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[3,0,3,0,7,4,7,4]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15]
-; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm14[1,1,2,2]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8],ymm1[9],ymm7[10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15]
+; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm7
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm7, %ymm1
+; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm8, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm7 = mem[1,1,2,2]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm7, %ymm4
-; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9)
-; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%r9)
-; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9)
-; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%r9)
-; AVX2-FAST-NEXT: vmovdqa %ymm5, 288(%r9)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%r9)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9)
+; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
+; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2]
+; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm5, 96(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm4, 128(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm15, 256(%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r9)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index 77cec3c022828..0e9124808eb77 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -188,18 +188,18 @@ define void @vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecp
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm1[0]
; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm5[0],xmm4[0]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm5[0],xmm4[0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,1,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,2,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,0]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
@@ -208,7 +208,7 @@ define void @vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecp
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
; AVX1-NEXT: vmovdqa %xmm0, 32(%rax)
; AVX1-NEXT: vmovaps %ymm1, (%rax)
@@ -353,132 +353,132 @@ define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecp
; SSE-LABEL: vf8:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa (%rdi), %xmm10
-; SSE-NEXT: movdqa (%rsi), %xmm0
+; SSE-NEXT: movdqa (%rdi), %xmm6
+; SSE-NEXT: movdqa (%rsi), %xmm8
; SSE-NEXT: movdqa (%rdx), %xmm2
-; SSE-NEXT: movdqa (%rcx), %xmm1
-; SSE-NEXT: movdqa (%r8), %xmm3
+; SSE-NEXT: movdqa (%rcx), %xmm9
+; SSE-NEXT: movdqa (%r8), %xmm7
; SSE-NEXT: movdqa (%r9), %xmm5
-; SSE-NEXT: movdqa %xmm2, %xmm9
-; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm10, %xmm12
-; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm12, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm9[3,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,3,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[0,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3]
-; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: andps %xmm11, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; SSE-NEXT: movaps %xmm11, %xmm8
-; SSE-NEXT: andnps %xmm6, %xmm8
-; SSE-NEXT: orps %xmm4, %xmm8
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-NEXT: andps %xmm11, %xmm0
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; SSE-NEXT: andnps %xmm1, %xmm11
-; SSE-NEXT: orps %xmm0, %xmm11
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm12[0]
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2]
-; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535]
-; SSE-NEXT: andps %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5]
-; SSE-NEXT: movaps %xmm4, %xmm1
-; SSE-NEXT: andnps %xmm6, %xmm1
-; SSE-NEXT: orps %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm3, %xmm7
-; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[0,2]
-; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: andps %xmm6, %xmm7
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; SSE-NEXT: movdqa %xmm6, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE-NEXT: movdqa %xmm3, %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm1[3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm0[0,1]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,0]
+; SSE-NEXT: andps %xmm0, %xmm10
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
+; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: andnps %xmm11, %xmm4
+; SSE-NEXT: orps %xmm10, %xmm4
+; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm2[3,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,6,5,7,7]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3]
+; SSE-NEXT: andps %xmm0, %xmm8
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
+; SSE-NEXT: andnps %xmm9, %xmm0
+; SSE-NEXT: orps %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm10
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm3[0]
+; SSE-NEXT: movdqa %xmm7, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2]
+; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535]
+; SSE-NEXT: andps %xmm8, %xmm10
+; SSE-NEXT: movdqa %xmm5, %xmm11
+; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5]
+; SSE-NEXT: movaps %xmm8, %xmm9
+; SSE-NEXT: andnps %xmm11, %xmm9
+; SSE-NEXT: orps %xmm10, %xmm9
+; SSE-NEXT: movdqa %xmm6, %xmm10
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1]
+; SSE-NEXT: movdqa %xmm7, %xmm12
+; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm10[0,2]
+; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: andps %xmm10, %xmm12
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3]
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: andnps %xmm13, %xmm0
-; SSE-NEXT: orps %xmm7, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm10[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[1,1,1,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm10[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
-; SSE-NEXT: andps %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,1,1]
+; SSE-NEXT: movaps %xmm10, %xmm11
+; SSE-NEXT: andnps %xmm13, %xmm11
+; SSE-NEXT: orps %xmm12, %xmm11
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[1,1,1,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm6[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2]
+; SSE-NEXT: andps %xmm8, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1]
; SSE-NEXT: pslld $16, %xmm5
-; SSE-NEXT: andnps %xmm5, %xmm4
-; SSE-NEXT: orps %xmm2, %xmm4
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm9[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[0,2]
-; SSE-NEXT: andps %xmm6, %xmm7
-; SSE-NEXT: andnps %xmm3, %xmm6
-; SSE-NEXT: orps %xmm7, %xmm6
-; SSE-NEXT: movaps %xmm6, 16(%rax)
-; SSE-NEXT: movaps %xmm4, 48(%rax)
-; SSE-NEXT: movaps %xmm0, 64(%rax)
-; SSE-NEXT: movaps %xmm1, (%rax)
-; SSE-NEXT: movaps %xmm11, 80(%rax)
-; SSE-NEXT: movaps %xmm8, 32(%rax)
+; SSE-NEXT: andnps %xmm5, %xmm8
+; SSE-NEXT: orps %xmm2, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[0,2]
+; SSE-NEXT: andps %xmm10, %xmm12
+; SSE-NEXT: andnps %xmm6, %xmm10
+; SSE-NEXT: orps %xmm12, %xmm10
+; SSE-NEXT: movaps %xmm10, 16(%rax)
+; SSE-NEXT: movaps %xmm8, 48(%rax)
+; SSE-NEXT: movaps %xmm11, 64(%rax)
+; SSE-NEXT: movaps %xmm9, (%rax)
+; SSE-NEXT: movaps %xmm0, 80(%rax)
+; SSE-NEXT: movaps %xmm4, 32(%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovdqa (%rdi), %xmm8
-; AVX1-NEXT: vmovdqa (%rsi), %xmm9
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vmovdqa (%rdx), %xmm2
; AVX1-NEXT: vmovdqa (%rcx), %xmm3
-; AVX1-NEXT: vmovdqa (%r8), %xmm11
+; AVX1-NEXT: vmovdqa (%r8), %xmm4
; AVX1-NEXT: vmovdqa (%r9), %xmm5
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5],xmm0[6,7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm6[4,5],xmm8[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5],xmm0[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3,4,5],xmm1[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,3,3]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3]
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3,4,5],xmm5[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovaps %ymm1, 64(%rax)
-; AVX1-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX1-NEXT: vmovaps %ymm10, (%rax)
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, 64(%rax)
+; AVX1-NEXT: vmovaps %ymm1, 32(%rax)
+; AVX1-NEXT: vmovaps %ymm8, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -648,173 +648,173 @@ define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecp
define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm11
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm14
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rdi), %xmm11
; SSE-NEXT: movdqa (%rsi), %xmm15
-; SSE-NEXT: movdqa 16(%rsi), %xmm4
-; SSE-NEXT: movdqa (%rdx), %xmm8
-; SSE-NEXT: movdqa 16(%rdx), %xmm1
-; SSE-NEXT: movdqa (%rcx), %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rcx), %xmm6
-; SSE-NEXT: movdqa 16(%r8), %xmm3
-; SSE-NEXT: movdqa 16(%r9), %xmm13
-; SSE-NEXT: movdqa %xmm1, %xmm7
-; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE-NEXT: movdqa %xmm14, %xmm9
-; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm7[3,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,7,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: andps %xmm12, %xmm0
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
-; SSE-NEXT: movaps %xmm12, %xmm2
-; SSE-NEXT: andnps %xmm5, %xmm2
-; SSE-NEXT: orps %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[0,1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-NEXT: andps %xmm12, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
-; SSE-NEXT: movaps %xmm12, %xmm2
-; SSE-NEXT: andnps %xmm5, %xmm2
-; SSE-NEXT: orps %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm8, %xmm4
-; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE-NEXT: movdqa %xmm15, %xmm10
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm4[3,3]
-; SSE-NEXT: movdqa (%r8), %xmm11
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,6,5,7,7]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[2,3]
-; SSE-NEXT: movdqa (%r9), %xmm15
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: andnps %xmm5, %xmm0
+; SSE-NEXT: movdqa 16(%rsi), %xmm6
+; SSE-NEXT: movdqa (%rdx), %xmm5
+; SSE-NEXT: movdqa 16(%rdx), %xmm0
+; SSE-NEXT: movdqa (%rcx), %xmm14
+; SSE-NEXT: movdqa 16(%rcx), %xmm8
+; SSE-NEXT: movdqa 16(%r8), %xmm9
+; SSE-NEXT: movdqa 16(%r9), %xmm10
+; SSE-NEXT: movdqa %xmm0, %xmm7
+; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; SSE-NEXT: movdqa %xmm11, %xmm3
+; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE-NEXT: movdqa %xmm3, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm7[3,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,6,5,7,7]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm2[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3]
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0]
+; SSE-NEXT: andps %xmm2, %xmm12
+; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: andnps %xmm13, %xmm4
+; SSE-NEXT: orps %xmm12, %xmm4
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3]
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm11, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm0[3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[2,1,3,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm8[0,1]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3]
-; SSE-NEXT: andps %xmm12, %xmm6
-; SSE-NEXT: orps %xmm6, %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm8[3,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm6[0,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3]
-; SSE-NEXT: andps %xmm12, %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; SSE-NEXT: andnps %xmm6, %xmm12
-; SSE-NEXT: orps %xmm5, %xmm12
-; SSE-NEXT: movdqa %xmm9, %xmm6
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2]
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,2,3,3]
-; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: andps %xmm2, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: andnps %xmm8, %xmm4
+; SSE-NEXT: orps %xmm6, %xmm4
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm12
+; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; SSE-NEXT: movdqa %xmm1, %xmm13
+; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: movdqa %xmm13, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm12[3,3]
+; SSE-NEXT: movdqa (%r8), %xmm8
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,6,5,7,7]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm6[2,3]
+; SSE-NEXT: movdqa (%r9), %xmm6
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; SSE-NEXT: movaps %xmm2, %xmm11
+; SSE-NEXT: andnps %xmm4, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3]
+; SSE-NEXT: andps %xmm2, %xmm15
+; SSE-NEXT: orps %xmm15, %xmm11
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm5[3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[2,1,3,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm14[0,1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3]
+; SSE-NEXT: andps %xmm2, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
+; SSE-NEXT: andnps %xmm14, %xmm2
+; SSE-NEXT: orps %xmm4, %xmm2
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1]
+; SSE-NEXT: movdqa %xmm9, %xmm11
+; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[0,2]
+; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,3,3]
+; SSE-NEXT: movdqa %xmm15, %xmm14
+; SSE-NEXT: pandn %xmm4, %xmm14
+; SSE-NEXT: andps %xmm15, %xmm11
+; SSE-NEXT: por %xmm11, %xmm14
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0]
+; SSE-NEXT: movdqa %xmm9, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[0,2]
+; SSE-NEXT: movdqa %xmm10, %xmm11
+; SSE-NEXT: pslld $16, %xmm11
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pandn %xmm11, %xmm4
+; SSE-NEXT: andps %xmm3, %xmm7
+; SSE-NEXT: por %xmm7, %xmm4
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, %xmm7
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[1,1,1,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm0[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm7[0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1]
+; SSE-NEXT: movdqa %xmm15, %xmm7
+; SSE-NEXT: pandn %xmm2, %xmm7
+; SSE-NEXT: andps %xmm15, %xmm11
+; SSE-NEXT: por %xmm11, %xmm7
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[0,2]
+; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5]
+; SSE-NEXT: movdqa %xmm3, %xmm9
+; SSE-NEXT: pandn %xmm10, %xmm9
+; SSE-NEXT: andps %xmm3, %xmm0
+; SSE-NEXT: por %xmm0, %xmm9
+; SSE-NEXT: movdqa %xmm13, %xmm0
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1]
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,3,3]
+; SSE-NEXT: movdqa %xmm15, %xmm0
; SSE-NEXT: pandn %xmm10, %xmm0
-; SSE-NEXT: andps %xmm6, %xmm5
-; SSE-NEXT: por %xmm5, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm9[0]
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[0,2]
-; SSE-NEXT: movdqa %xmm13, %xmm5
-; SSE-NEXT: pslld $16, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: pandn %xmm5, %xmm10
-; SSE-NEXT: andps %xmm9, %xmm7
-; SSE-NEXT: por %xmm7, %xmm10
-; SSE-NEXT: movdqa %xmm14, %xmm5
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1]
-; SSE-NEXT: movdqa %xmm6, %xmm7
-; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: andps %xmm6, %xmm0
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2]
-; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5]
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pandn %xmm13, %xmm3
-; SSE-NEXT: andps %xmm9, %xmm1
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm11, %xmm5
-; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
+; SSE-NEXT: andps %xmm15, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm13[0]
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm13[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[0,2]
; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: andps %xmm6, %xmm5
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1],xmm2[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2]
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: movdqa %xmm9, %xmm5
-; SSE-NEXT: pandn %xmm0, %xmm5
-; SSE-NEXT: andps %xmm9, %xmm4
-; SSE-NEXT: por %xmm4, %xmm5
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[1,1,1,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm8[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2]
-; SSE-NEXT: andps %xmm6, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1]
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: por %xmm4, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[0,2]
-; SSE-NEXT: andps %xmm9, %xmm8
-; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5]
-; SSE-NEXT: pandn %xmm15, %xmm9
-; SSE-NEXT: por %xmm8, %xmm9
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm10
+; SSE-NEXT: pandn %xmm1, %xmm10
+; SSE-NEXT: andps %xmm3, %xmm12
+; SSE-NEXT: por %xmm12, %xmm10
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: movaps %xmm12, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,1,1,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm5[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm1[0,2]
+; SSE-NEXT: andps %xmm15, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1]
+; SSE-NEXT: pandn %xmm1, %xmm15
+; SSE-NEXT: por %xmm11, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm12[0]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm12[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[0,2]
+; SSE-NEXT: andps %xmm3, %xmm5
+; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5]
+; SSE-NEXT: pandn %xmm6, %xmm3
+; SSE-NEXT: por %xmm5, %xmm3
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa %xmm9, (%rax)
-; SSE-NEXT: movdqa %xmm6, 16(%rax)
-; SSE-NEXT: movdqa %xmm5, 48(%rax)
-; SSE-NEXT: movdqa %xmm1, 64(%rax)
-; SSE-NEXT: movdqa %xmm3, 96(%rax)
+; SSE-NEXT: movdqa %xmm3, (%rax)
+; SSE-NEXT: movdqa %xmm15, 16(%rax)
+; SSE-NEXT: movdqa %xmm10, 48(%rax)
+; SSE-NEXT: movdqa %xmm0, 64(%rax)
+; SSE-NEXT: movdqa %xmm9, 96(%rax)
; SSE-NEXT: movdqa %xmm7, 112(%rax)
-; SSE-NEXT: movdqa %xmm10, 144(%rax)
+; SSE-NEXT: movdqa %xmm4, 144(%rax)
+; SSE-NEXT: movdqa %xmm14, 160(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 160(%rax)
-; SSE-NEXT: movaps %xmm12, 32(%rax)
+; SSE-NEXT: movaps %xmm0, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -825,139 +825,135 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
;
; AVX1-LABEL: vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rcx), %xmm8
+; AVX1-NEXT: vmovdqa (%rcx), %xmm2
; AVX1-NEXT: vmovdqa 16(%rcx), %xmm0
-; AVX1-NEXT: vmovdqa (%rdx), %xmm9
+; AVX1-NEXT: vmovdqa (%rdx), %xmm3
; AVX1-NEXT: vmovdqa 16(%rdx), %xmm1
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,0,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovdqa (%rsi), %xmm15
-; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2
-; AVX1-NEXT: vmovdqa (%rdi), %xmm11
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,2,3,3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vmovdqa (%rsi), %xmm5
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
+; AVX1-NEXT: vmovdqa (%rdi), %xmm6
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm7
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vmovdqa 16(%r8), %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7]
-; AVX1-NEXT: vmovdqa 16(%r9), %xmm7
-; AVX1-NEXT: vpslld $16, %xmm7, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5],xmm0[6,7]
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX1-NEXT: vmovdqa 16(%r8), %xmm10
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4,5],xmm7[6,7]
+; AVX1-NEXT: vmovdqa 16(%r9), %xmm12
+; AVX1-NEXT: vpslld $16, %xmm12, %xmm11
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm11[5],xmm7[6,7]
+; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[2,1,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5],xmm4[6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6],xmm4[7]
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX1-NEXT: vmovdqa (%r8), %xmm2
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11
+; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
+; AVX1-NEXT: vmovdqa (%r8), %xmm15
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,5,7,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3,4,5],xmm0[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm11
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3,4,5],xmm0[6,7]
; AVX1-NEXT: vmovdqa (%r9), %xmm0
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5,6],xmm5[7]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm6[0,1,2],xmm5[3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1,2],xmm6[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1],xmm4[0],xmm5[3]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm8
-; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5,6],ymm6[7]
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm6[0,1],xmm2[0],xmm6[3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm4[1],xmm11[2,3,4,5,6],xmm4[7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1,2],xmm14[3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5,6],ymm4[7]
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm10[0],xmm4[3]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4],xmm9[5],xmm4[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0],xmm3[3]
; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5],xmm4[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm2[4,5],xmm5[6,7]
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm15[4,5],xmm2[6,7]
+; AVX1-NEXT: vpslld $16, %xmm0, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,1,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7]
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX1-NEXT: vmovdqa %xmm5, 48(%rax)
-; AVX1-NEXT: vmovdqa %xmm4, (%rax)
-; AVX1-NEXT: vmovdqa %xmm3, 16(%rax)
-; AVX1-NEXT: vmovdqa %xmm10, 96(%rax)
-; AVX1-NEXT: vmovdqa %xmm12, 112(%rax)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rax)
+; AVX1-NEXT: vmovdqa %xmm3, (%rax)
+; AVX1-NEXT: vmovdqa %xmm5, 16(%rax)
+; AVX1-NEXT: vmovdqa %xmm9, 96(%rax)
+; AVX1-NEXT: vmovdqa %xmm8, 112(%rax)
; AVX1-NEXT: vmovdqa %xmm14, 64(%rax)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vmovaps %xmm0, 80(%rax)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vmovaps %xmm0, 160(%rax)
+; AVX1-NEXT: vmovdqa %xmm11, 80(%rax)
+; AVX1-NEXT: vmovdqa %xmm7, 160(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 176(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -969,113 +965,113 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
;
; AVX2-SLOW-LABEL: vf16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm11
-; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15
-; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm9
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm10
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2
+; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3
+; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm1
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm9
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,1,2,1]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,5]
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
-; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm2
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,2,1]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5]
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,1,2,1]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5]
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7]
+; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm11
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm3
-; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[2,1,2,3,6,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[2,1,2,3,6,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9
+; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,1,2,3,6,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[2,1,2,3,6,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm12
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2],ymm14[3,4],ymm4[5],ymm14[6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm14, %ymm3
-; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm14
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm11[4],ymm8[5],ymm11[5],ymm8[6],ymm11[6],ymm8[7],ymm11[7],ymm8[12],ymm11[12],ymm8[13],ymm11[13],ymm8[14],ymm11[14],ymm8[15],ymm11[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm15[4],ymm9[4],ymm15[5],ymm9[5],ymm15[6],ymm9[6],ymm15[7],ymm9[7],ymm15[12],ymm9[12],ymm15[13],ymm9[13],ymm15[14],ymm9[14],ymm15[15],ymm9[15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,3,3,5,6,7,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm3
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm9
+; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1]
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,3,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm14, %ymm15, %ymm14
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm13, %ymm15, %ymm9
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,0,2,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7]
+; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[8],ymm11[8],ymm8[9],ymm11[9],ymm8[10],ymm11[10],ymm8[11],ymm11[11]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm15[0],ymm9[0],ymm15[1],ymm9[1],ymm15[2],ymm9[2],ymm15[3],ymm9[3],ymm15[8],ymm9[8],ymm15[9],ymm9[9],ymm15[10],ymm9[10],ymm15[11],ymm9[11]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,2,2,5,4,6,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm9, 160(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax)
; AVX2-SLOW-NEXT: vmovdqa %ymm14, 64(%rax)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax)
@@ -1086,108 +1082,108 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
;
; AVX2-FAST-ALL-LABEL: vf16:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm9
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm10
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm15
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm1
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm4
-; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm11
+; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm2
; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm5
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm5, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm7, %xmm5, %xmm8
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm6
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm7
-; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm0
-; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm1
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %xmm2
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm7, %xmm6, %xmm7
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm8
+; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm9
+; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm11 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm10, %ymm10
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm10
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %xmm11
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm10, %ymm12
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm9, %ymm3
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm13, %ymm7, %ymm12, %ymm7
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm1, %ymm14
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm0, %ymm12
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11]
; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm14 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11]
+; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm12
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm13, %ymm3, %ymm14, %ymm3
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <1,u,u,2,u,u,3,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm14, %ymm3
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5,6],ymm14[7]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm14, %ymm14
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm7
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,2,u,u,3,u>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm14, %ymm15, %ymm14
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm14, %ymm15, %ymm14
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <5,u,u,6,u,u,7,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm13, %ymm3
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm13, %ymm3
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2]
-; AVX2-FAST-ALL-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm7, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermd %ymm15, %ymm13, %ymm13
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm13, %ymm15, %ymm7
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,0,2,2,1,0,2,2]
+; AVX2-FAST-ALL-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm8, %ymm9, %ymm8
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
-; AVX2-FAST-ALL-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6],ymm8[7]
+; AVX2-FAST-ALL-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm6, %ymm6
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm6, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,4,u,u,5,u,u,6>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1
+; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 96(%rax)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 160(%rax)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%rax)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 96(%rax)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm7, 160(%rax)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%rax)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm14, 64(%rax)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 128(%rax)
@@ -1198,119 +1194,112 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
;
; AVX2-FAST-PERLANE-LABEL: vf16:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm12
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,1,3,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm8, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm14[0],ymm3[1],ymm14[1],ymm3[2],ymm14[2],ymm3[3],ymm14[3],ymm3[8],ymm14[8],ymm3[9],ymm14[9],ymm3[10],ymm14[10],ymm3[11],ymm14[11]
-; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm14 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm15 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm11 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm10
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm11
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,2,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm13, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm14
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm13
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
+; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm14 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm15
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm14 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm14, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm13
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6],ymm12[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm12, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4],ymm14[5],ymm8[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5,6],ymm14[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
-; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
+; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 64(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
@@ -1365,367 +1354,361 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $296, %rsp # imm = 0x128
-; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: subq $280, %rsp # imm = 0x118
+; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm13
; SSE-NEXT: movdqa (%rsi), %xmm2
-; SSE-NEXT: movdqa 16(%rsi), %xmm14
-; SSE-NEXT: movdqa (%rdx), %xmm1
-; SSE-NEXT: movdqa 16(%rdx), %xmm9
-; SSE-NEXT: movdqa (%rcx), %xmm7
-; SSE-NEXT: movdqa 16(%rcx), %xmm15
-; SSE-NEXT: movdqa (%r8), %xmm5
-; SSE-NEXT: movdqa (%r9), %xmm12
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm4[3,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,3,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm4[0,1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3]
-; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: andps %xmm10, %xmm3
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; SSE-NEXT: movaps %xmm10, %xmm4
-; SSE-NEXT: andnps %xmm6, %xmm4
-; SSE-NEXT: orps %xmm3, %xmm4
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: movdqa 16(%rsi), %xmm9
+; SSE-NEXT: movdqa (%rdx), %xmm12
+; SSE-NEXT: movdqa 16(%rdx), %xmm14
+; SSE-NEXT: movdqa (%rcx), %xmm4
+; SSE-NEXT: movdqa 16(%rcx), %xmm10
+; SSE-NEXT: movdqa (%r8), %xmm7
+; SSE-NEXT: movdqa (%r9), %xmm11
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[3,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm3[2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm3[0,1]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3]
+; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0]
+; SSE-NEXT: andps %xmm6, %xmm8
+; SSE-NEXT: movdqa %xmm11, %xmm3
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
+; SSE-NEXT: movaps %xmm6, %xmm0
+; SSE-NEXT: andnps %xmm11, %xmm0
+; SSE-NEXT: orps %xmm8, %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm12[3,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,7,7]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm4[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3]
-; SSE-NEXT: andps %xmm10, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; SSE-NEXT: movaps %xmm10, %xmm4
-; SSE-NEXT: andnps %xmm3, %xmm4
-; SSE-NEXT: orps %xmm2, %xmm4
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE-NEXT: andps %xmm6, %xmm2
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; SSE-NEXT: movaps %xmm6, %xmm0
+; SSE-NEXT: andnps %xmm4, %xmm0
+; SSE-NEXT: orps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm13, %xmm11
-; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; SSE-NEXT: movdqa %xmm11, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[3,3]
-; SSE-NEXT: movdqa 16(%r8), %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[0,1]
-; SSE-NEXT: movdqa 16(%r9), %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3]
+; SSE-NEXT: movdqa 16(%r8), %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; SSE-NEXT: movaps %xmm10, %xmm7
-; SSE-NEXT: andnps %xmm6, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3]
-; SSE-NEXT: andps %xmm10, %xmm3
-; SSE-NEXT: orps %xmm3, %xmm7
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm13, %xmm3
-; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm9[3,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,5,7,7]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: andnps %xmm6, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3]
-; SSE-NEXT: andps %xmm10, %xmm3
-; SSE-NEXT: orps %xmm3, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm8[0,1]
+; SSE-NEXT: movdqa 16(%r9), %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
+; SSE-NEXT: movaps %xmm6, %xmm0
+; SSE-NEXT: andnps %xmm12, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3]
+; SSE-NEXT: andps %xmm6, %xmm11
+; SSE-NEXT: orps %xmm11, %xmm0
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rdx), %xmm2
-; SSE-NEXT: movdqa 32(%rcx), %xmm15
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7]
+; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill
+; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm13, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm14[3,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm10[2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
+; SSE-NEXT: movaps %xmm6, %xmm0
+; SSE-NEXT: andnps %xmm10, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3]
+; SSE-NEXT: andps %xmm6, %xmm9
+; SSE-NEXT: orps %xmm9, %xmm0
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 32(%rdx), %xmm1
+; SSE-NEXT: movdqa 32(%rcx), %xmm9
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rdi), %xmm9
-; SSE-NEXT: movdqa 32(%rsi), %xmm14
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[3,3]
-; SSE-NEXT: movdqa 32(%r8), %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[0,1]
-; SSE-NEXT: movdqa 32(%r9), %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm7
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE-NEXT: movaps %xmm10, %xmm1
-; SSE-NEXT: andnps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3]
-; SSE-NEXT: andps %xmm10, %xmm3
-; SSE-NEXT: orps %xmm3, %xmm1
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE-NEXT: movdqa 32(%rdi), %xmm12
+; SSE-NEXT: movdqa 32(%rsi), %xmm10
+; SSE-NEXT: movdqa %xmm12, %xmm11
+; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; SSE-NEXT: movdqa %xmm11, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm0[3,3]
+; SSE-NEXT: movdqa 32(%r8), %xmm13
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[2,1,3,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1]
+; SSE-NEXT: movdqa 32(%r9), %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,6,5,7,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; SSE-NEXT: movaps %xmm10, %xmm1
-; SSE-NEXT: andnps %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE-NEXT: movaps %xmm6, %xmm14
+; SSE-NEXT: andnps %xmm0, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3]
+; SSE-NEXT: andps %xmm6, %xmm15
+; SSE-NEXT: orps %xmm15, %xmm14
+; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,6,5,7,7]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm9[2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
+; SSE-NEXT: movaps %xmm6, %xmm10
+; SSE-NEXT: andnps %xmm9, %xmm10
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-NEXT: andps %xmm10, %xmm0
-; SSE-NEXT: orps %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: andps %xmm6, %xmm0
+; SSE-NEXT: orps %xmm0, %xmm10
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdx), %xmm0
-; SSE-NEXT: movdqa 48(%rcx), %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 48(%rcx), %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm2
-; SSE-NEXT: movdqa 48(%rsi), %xmm14
+; SSE-NEXT: movdqa 48(%rsi), %xmm9
; SSE-NEXT: movdqa %xmm2, %xmm15
-; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm1[3,3]
-; SSE-NEXT: movdqa 48(%r8), %xmm9
-; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[2,1,3,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[3,3]
+; SSE-NEXT: movdqa 48(%r8), %xmm10
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[2,1,3,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1]
; SSE-NEXT: movdqa 48(%r9), %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm13
-; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
-; SSE-NEXT: movaps %xmm10, %xmm1
-; SSE-NEXT: andnps %xmm7, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
+; SSE-NEXT: movaps %xmm6, %xmm3
+; SSE-NEXT: andnps %xmm14, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3]
-; SSE-NEXT: andps %xmm10, %xmm15
-; SSE-NEXT: orps %xmm15, %xmm1
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE-NEXT: andps %xmm6, %xmm15
+; SSE-NEXT: orps %xmm15, %xmm3
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[3,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,6,5,7,7]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm7[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[3,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3]
-; SSE-NEXT: andps %xmm10, %xmm2
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; SSE-NEXT: andnps %xmm7, %xmm10
-; SSE-NEXT: orps %xmm2, %xmm10
+; SSE-NEXT: andps %xmm6, %xmm2
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,6,6,7]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
+; SSE-NEXT: andnps %xmm9, %xmm6
+; SSE-NEXT: orps %xmm2, %xmm6
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[0,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0]
-; SSE-NEXT: movdqa %xmm5, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[0,2]
-; SSE-NEXT: movdqa %xmm12, %xmm7
-; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm14, %xmm1
-; SSE-NEXT: pandn %xmm7, %xmm1
-; SSE-NEXT: andps %xmm14, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm14
+; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pandn %xmm14, %xmm1
+; SSE-NEXT: andps %xmm9, %xmm0
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm2[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,1,1,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[0,2]
-; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,0,1,1]
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: pandn %xmm8, %xmm0
-; SSE-NEXT: andps %xmm15, %xmm7
-; SSE-NEXT: por %xmm7, %xmm0
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[1,1,1,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[0,2]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
+; SSE-NEXT: andps %xmm5, %xmm14
+; SSE-NEXT: por %xmm14, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, %xmm7
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movaps %xmm15, %xmm6
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm5, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[1,3]
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
-; SSE-NEXT: pslld $16, %xmm12
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: pandn %xmm12, %xmm0
-; SSE-NEXT: andps %xmm14, %xmm7
-; SSE-NEXT: por %xmm7, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,2]
-; SSE-NEXT: movdqa %xmm15, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
+; SSE-NEXT: movdqa %xmm7, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm14[0,2]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,3,3]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm9, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: andps %xmm15, %xmm5
-; SSE-NEXT: por %xmm5, %xmm0
+; SSE-NEXT: andps %xmm9, %xmm6
+; SSE-NEXT: por %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm11[0]
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
-; SSE-NEXT: movdqa %xmm14, %xmm3
-; SSE-NEXT: pandn %xmm5, %xmm3
-; SSE-NEXT: andps %xmm14, %xmm1
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm2[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,1,1,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: pandn %xmm7, %xmm0
-; SSE-NEXT: andps %xmm15, %xmm1
-; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm15[1]
+; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm15[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2]
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: pandn %xmm14, %xmm0
+; SSE-NEXT: andps %xmm5, %xmm7
+; SSE-NEXT: por %xmm7, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm4, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm0[1,3]
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[0,2]
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: movdqa %xmm14, %xmm7
-; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: andps %xmm14, %xmm1
-; SSE-NEXT: por %xmm1, %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1]
-; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[0,2]
-; SSE-NEXT: movdqa %xmm15, %xmm8
-; SSE-NEXT: pandn %xmm3, %xmm8
-; SSE-NEXT: andps %xmm15, %xmm4
-; SSE-NEXT: por %xmm4, %xmm8
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movaps %xmm4, %xmm1
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movaps %xmm3, %xmm6
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2]
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[0,2]
+; SSE-NEXT: movdqa %xmm8, %xmm7
+; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pandn %xmm7, %xmm1
+; SSE-NEXT: andps %xmm9, %xmm6
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,1,1,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,1,1]
+; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: pandn %xmm7, %xmm1
+; SSE-NEXT: andps %xmm5, %xmm6
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, %xmm7
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
+; SSE-NEXT: movaps %xmm0, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm1[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,2,3,3]
+; SSE-NEXT: pslld $16, %xmm8
+; SSE-NEXT: movdqa %xmm9, %xmm6
+; SSE-NEXT: pandn %xmm8, %xmm6
+; SSE-NEXT: andps %xmm9, %xmm7
+; SSE-NEXT: por %xmm7, %xmm6
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2]
+; SSE-NEXT: movdqa %xmm5, %xmm7
+; SSE-NEXT: pandn %xmm14, %xmm7
+; SSE-NEXT: andps %xmm5, %xmm0
+; SSE-NEXT: por %xmm0, %xmm7
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0]
+; SSE-NEXT: movdqa %xmm13, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[0,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
-; SSE-NEXT: movdqa %xmm14, %xmm7
-; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: andps %xmm14, %xmm1
-; SSE-NEXT: por %xmm1, %xmm7
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[1,1,1,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm15, %xmm11
-; SSE-NEXT: pandn %xmm5, %xmm11
-; SSE-NEXT: andps %xmm15, %xmm1
-; SSE-NEXT: por %xmm1, %xmm11
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movaps %xmm4, %xmm5
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[1,3]
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[0,2]
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,2,3,3]
+; SSE-NEXT: movdqa %xmm0, %xmm14
+; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5]
+; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: pandn %xmm14, %xmm3
+; SSE-NEXT: andps %xmm9, %xmm8
+; SSE-NEXT: por %xmm8, %xmm3
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[1,1,1,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1]
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm11, %xmm8
+; SSE-NEXT: andps %xmm5, %xmm14
+; SSE-NEXT: por %xmm14, %xmm8
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm1, %xmm14
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm12[0]
+; SSE-NEXT: movdqa %xmm13, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm12[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,3,3]
; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: movdqa %xmm14, %xmm3
-; SSE-NEXT: pandn %xmm0, %xmm3
-; SSE-NEXT: andps %xmm14, %xmm5
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2]
-; SSE-NEXT: movdqa %xmm15, %xmm12
-; SSE-NEXT: pandn %xmm13, %xmm12
-; SSE-NEXT: andps %xmm15, %xmm6
-; SSE-NEXT: por %xmm6, %xmm12
+; SSE-NEXT: movdqa %xmm9, %xmm11
+; SSE-NEXT: pandn %xmm0, %xmm11
+; SSE-NEXT: andps %xmm9, %xmm14
+; SSE-NEXT: por %xmm14, %xmm11
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm1[1]
+; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[0,2]
+; SSE-NEXT: movdqa %xmm5, %xmm12
+; SSE-NEXT: pandn %xmm15, %xmm12
+; SSE-NEXT: andps %xmm5, %xmm13
+; SSE-NEXT: por %xmm13, %xmm12
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: movaps %xmm1, %xmm14
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm9, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2]
-; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5]
-; SSE-NEXT: movdqa %xmm14, %xmm13
-; SSE-NEXT: pandn %xmm6, %xmm13
-; SSE-NEXT: andps %xmm14, %xmm5
-; SSE-NEXT: por %xmm5, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0]
+; SSE-NEXT: movdqa %xmm10, %xmm13
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[0,2]
+; SSE-NEXT: movdqa %xmm4, %xmm15
+; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5]
+; SSE-NEXT: movdqa %xmm9, %xmm13
+; SSE-NEXT: pandn %xmm15, %xmm13
+; SSE-NEXT: andps %xmm9, %xmm14
+; SSE-NEXT: por %xmm14, %xmm13
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[1,1,1,1,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm10[1,1,1,1,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm1[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1]
-; SSE-NEXT: movdqa %xmm15, %xmm6
-; SSE-NEXT: pandn %xmm1, %xmm6
-; SSE-NEXT: andps %xmm15, %xmm5
-; SSE-NEXT: por %xmm5, %xmm6
+; SSE-NEXT: movdqa %xmm5, %xmm14
+; SSE-NEXT: pandn %xmm1, %xmm14
+; SSE-NEXT: andps %xmm5, %xmm15
+; SSE-NEXT: por %xmm15, %xmm14
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm9, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm0[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2]
-; SSE-NEXT: andps %xmm14, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3]
+; SSE-NEXT: movdqa %xmm10, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,1],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[0,2]
+; SSE-NEXT: andps %xmm9, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,3,3]
; SSE-NEXT: pslld $16, %xmm4
-; SSE-NEXT: pandn %xmm4, %xmm14
-; SSE-NEXT: por %xmm1, %xmm14
+; SSE-NEXT: pandn %xmm4, %xmm9
+; SSE-NEXT: por %xmm1, %xmm9
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2]
-; SSE-NEXT: andps %xmm15, %xmm9
-; SSE-NEXT: pandn %xmm5, %xmm15
-; SSE-NEXT: por %xmm9, %xmm15
+; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,2]
+; SSE-NEXT: andps %xmm5, %xmm10
+; SSE-NEXT: pandn %xmm15, %xmm5
+; SSE-NEXT: por %xmm10, %xmm5
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa %xmm15, 352(%rax)
-; SSE-NEXT: movdqa %xmm14, 336(%rax)
-; SSE-NEXT: movdqa %xmm6, 304(%rax)
+; SSE-NEXT: movdqa %xmm5, 352(%rax)
+; SSE-NEXT: movdqa %xmm9, 336(%rax)
+; SSE-NEXT: movdqa %xmm14, 304(%rax)
; SSE-NEXT: movdqa %xmm13, 288(%rax)
; SSE-NEXT: movdqa %xmm12, 256(%rax)
-; SSE-NEXT: movdqa %xmm3, 240(%rax)
-; SSE-NEXT: movdqa %xmm11, 208(%rax)
-; SSE-NEXT: movdqa %xmm7, 192(%rax)
-; SSE-NEXT: movdqa %xmm8, 160(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 144(%rax)
+; SSE-NEXT: movdqa %xmm11, 240(%rax)
+; SSE-NEXT: movdqa %xmm8, 208(%rax)
+; SSE-NEXT: movdqa %xmm3, 192(%rax)
+; SSE-NEXT: movdqa %xmm7, 160(%rax)
+; SSE-NEXT: movdqa %xmm6, 144(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1738,7 +1721,8 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rax)
-; SSE-NEXT: movaps %xmm10, 368(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 368(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 320(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1753,154 +1737,152 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; SSE-NEXT: movaps %xmm0, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rax)
-; SSE-NEXT: addq $296, %rsp # imm = 0x128
+; SSE-NEXT: addq $280, %rsp # imm = 0x118
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $184, %rsp
+; AVX1-NEXT: subq $120, %rsp
; AVX1-NEXT: vmovdqa 32(%rcx), %xmm8
; AVX1-NEXT: vmovdqa 48(%rcx), %xmm0
-; AVX1-NEXT: vmovdqa 32(%rdx), %xmm10
+; AVX1-NEXT: vmovdqa 32(%rdx), %xmm9
; AVX1-NEXT: vmovdqa 48(%rdx), %xmm1
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,2,3,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovdqa 32(%rsi), %xmm11
+; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX1-NEXT: vmovdqa 48(%rsi), %xmm2
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,3,2,3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7]
-; AVX1-NEXT: vmovdqa 48(%r8), %xmm6
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm7[1,2],xmm0[3]
-; AVX1-NEXT: vmovdqa 48(%r9), %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5],xmm3[6,7]
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5],xmm3[6,7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7]
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX1-NEXT: vmovdqa 32(%r8), %xmm7
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX1-NEXT: vmovdqa 32(%r9), %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5],xmm4[6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7]
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm3[0,1],xmm6[0],xmm3[3]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm11
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX1-NEXT: vmovdqa 48(%r8), %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm4[1,2],xmm2[3]
+; AVX1-NEXT: vmovdqa 48(%r9), %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm12[1],xmm5[2,3,4,5,6],xmm12[7]
+; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT: vpslld $16, %xmm2, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7]
; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm3[0,1],xmm7[0],xmm3[3]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,6,5,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5],xmm3[6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5,6],xmm3[7]
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
+; AVX1-NEXT: vmovdqa 32(%r8), %xmm1
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm4[1],xmm12[2,3]
+; AVX1-NEXT: vmovdqa 32(%r9), %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm12
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,6,5,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7]
+; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = xmm6[0,1],xmm0[0],xmm6[3]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7]
+; AVX1-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3],xmm6[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5],xmm7[6,7]
+; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3],xmm2[4,5,6,7]
+; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovdqa 16(%rcx), %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm0[1,2],xmm3[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7]
-; AVX1-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6],xmm5[7]
+; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa 16(%rdx), %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7]
-; AVX1-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpslld $16, %xmm4, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
@@ -1935,17 +1917,16 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa (%rcx), %xmm14
-; AVX1-NEXT: vmovdqa (%rdx), %xmm11
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,2,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
+; AVX1-NEXT: vmovdqa (%rcx), %xmm9
+; AVX1-NEXT: vmovdqa (%rdx), %xmm8
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa (%rsi), %xmm9
-; AVX1-NEXT: vmovdqa (%rdi), %xmm8
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; AVX1-NEXT: vmovdqa (%rsi), %xmm6
+; AVX1-NEXT: vmovdqa (%rdi), %xmm5
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
@@ -1953,61 +1934,58 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm0[1],xmm12[2,3]
; AVX1-NEXT: vmovdqa (%r9), %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1,2],xmm15[3],xmm13[4,5,6,7]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,6,5,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3,4,5],xmm7[6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0],xmm6[1],xmm7[2,3,4,5,6],xmm6[7]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm5[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm7
-; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = xmm6[0,1],xmm3[0],xmm6[3]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1,2,3,4],xmm5[5],xmm7[6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,6,5,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3,4,5],xmm14[6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,6,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5,6],xmm14[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10
+; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm10 = xmm11[0,1],xmm3[0],xmm11[3]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm14[5],xmm10[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm11
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3],xmm11[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm8
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0,1],xmm1[0],xmm5[3]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4],xmm2[5],xmm7[6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm8 = xmm6[0,1],xmm1[0],xmm6[3]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3],xmm6[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5,6],xmm6[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7]
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
@@ -2015,15 +1993,13 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: vmovdqa %xmm0, 48(%rax)
; AVX1-NEXT: vmovdqa %xmm4, 32(%rax)
-; AVX1-NEXT: vmovdqa %xmm5, 16(%rax)
-; AVX1-NEXT: vmovdqa %xmm2, (%rax)
-; AVX1-NEXT: vmovdqa %xmm12, 112(%rax)
-; AVX1-NEXT: vmovdqa %xmm15, 96(%rax)
-; AVX1-NEXT: vmovdqa %xmm13, 80(%rax)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vmovaps %xmm0, 64(%rax)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vmovaps %xmm0, 176(%rax)
+; AVX1-NEXT: vmovdqa %xmm6, 16(%rax)
+; AVX1-NEXT: vmovdqa %xmm8, (%rax)
+; AVX1-NEXT: vmovdqa %xmm2, 112(%rax)
+; AVX1-NEXT: vmovdqa %xmm10, 96(%rax)
+; AVX1-NEXT: vmovdqa %xmm12, 80(%rax)
+; AVX1-NEXT: vmovdqa %xmm13, 64(%rax)
+; AVX1-NEXT: vmovdqa %xmm15, 176(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 160(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -2032,7 +2008,7 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: vmovaps %xmm0, 128(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 240(%rax)
-; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 224(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 208(%rax)
@@ -2040,7 +2016,7 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: vmovaps %xmm0, 192(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 304(%rax)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 288(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 272(%rax)
@@ -2054,31 +2030,31 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX1-NEXT: vmovaps %xmm0, 336(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 320(%rax)
-; AVX1-NEXT: addq $184, %rsp
+; AVX1-NEXT: addq $120, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: subq $648, %rsp # imm = 0x288
-; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0
-; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm9
+; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5
; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7
+; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -2086,7 +2062,7 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm3
+; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm8
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
@@ -2095,42 +2071,43 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm11
-; AVX2-SLOW-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm10
+; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
-; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,2,1]
+; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1]
-; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
+; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2
; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm3
+; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6
+; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8
; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2
; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9
+; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1
@@ -2155,14 +2132,13 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,1,2,3,6,5,6,7]
@@ -2183,60 +2159,59 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7]
; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm13, %ymm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm12, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,3,3]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm13
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm12
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm13, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,3,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7]
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,2,3,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm5
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
-; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,3,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm10
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31>
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm5
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31>
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm5
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -2248,86 +2223,86 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm3
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm14[4],ymm5[4],ymm14[5],ymm5[5],ymm14[6],ymm5[6],ymm14[7],ymm5[7],ymm14[12],ymm5[12],ymm14[13],ymm5[13],ymm14[14],ymm5[14],ymm14[15],ymm5[15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm7
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7]
+; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm12
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm7, %ymm15
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm12, %ymm15
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-SLOW-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm7 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-SLOW-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm12 # 16-byte Folded Reload
+; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,2]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7]
-; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm7 = mem[0,0,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm7, %ymm4
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,0,2,2]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7]
+; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX2-SLOW-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7]
+; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX2-SLOW-NEXT: # xmm12 = mem[0,0,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
+; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6],ymm0[7]
-; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX2-SLOW-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,0,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm7, %ymm0
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,0,2,2,5,4,6,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3],ymm14[4],ymm7[5,6],ymm14[7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7]
+; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX2-SLOW-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[0,0,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm11, %ymm7
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[2],mem[2],ymm6[3],mem[3],ymm6[8],mem[8],ymm6[9],mem[9],ymm6[10],mem[10],ymm6[11],mem[11]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,2,5,4,6,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax)
; AVX2-SLOW-NEXT: vmovdqa %ymm15, 160(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm7, 288(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm10, 288(%rax)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm1, 352(%rax)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax)
@@ -2350,25 +2325,26 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
;
; AVX2-FAST-ALL-LABEL: vf32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: subq $648, %rsp # imm = 0x288
+; AVX2-FAST-ALL-NEXT: subq $616, %rsp # imm = 0x268
; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %xmm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %xmm4
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm5
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm2
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm4
-; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm3
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm8
+; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm6
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm3
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm6
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm9
; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm2, %ymm2
@@ -2383,50 +2359,52 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm8, %xmm0, %xmm1
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, %xmm13
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm8, %xmm11, %xmm0
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm4, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, %xmm7
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, %xmm10
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, %xmm14
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm8, %xmm13
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm9, %xmm15
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %xmm3
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm8
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm11
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r9), %xmm3
; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm1
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm7
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm8
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %ymm2
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm9
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm3, %ymm2
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm12
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %ymm9
-; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm14
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
@@ -2441,7 +2419,7 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm2
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2451,11 +2429,11 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm3
; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
@@ -2469,48 +2447,48 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm0, %ymm4
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm5, %ymm4, %ymm0
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <1,2,1,2,u,u,3,3>
; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm8, %xmm13
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm11, %xmm13
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm7, %xmm4
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm8, %xmm4
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm13, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm5, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm5, %xmm5
+; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm2, %xmm5
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm14, %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15]
+; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,5,6,5,6,7,7]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, %ymm11
; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31>
@@ -2519,16 +2497,17 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31>
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm6, %ymm5
+; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm8, %ymm5
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -2563,36 +2542,35 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX2-FAST-ALL-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm13, %ymm4
-; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # xmm13 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5,6],ymm4[7]
-; AVX2-FAST-ALL-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm5, %ymm5
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,0,2,1,4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm5, %ymm5
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # ymm5 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
+; AVX2-FAST-ALL-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: # xmm13 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm13, %ymm13
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2],ymm4[3,4],ymm13[5],ymm4[6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: # xmm13 = mem[0,0,2,1,4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm13, %ymm13
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm15, %ymm4, %ymm13, %ymm4
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11]
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = [5,4,2,2,5,4,6,6]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm14, %ymm5
+; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm14, %ymm12
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6],ymm5[7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm12 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2],ymm5[3,4],ymm12[5],ymm5[6,7]
-; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX2-FAST-ALL-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm5
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7]
+; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm10 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
+; AVX2-FAST-ALL-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm10
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-FAST-ALL-NEXT: # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[2],mem[2],ymm6[3],mem[3],ymm6[8],mem[8],ymm6[9],mem[9],ymm6[10],mem[10],ymm6[11],mem[11]
; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm14, %ymm6
-; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -2604,7 +2582,7 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 96(%rax)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 160(%rax)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, 288(%rax)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm10, 288(%rax)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 352(%rax)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, (%rax)
@@ -2621,22 +2599,22 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX2-FAST-ALL-NEXT: addq $648, %rsp # imm = 0x288
+; AVX2-FAST-ALL-NEXT: addq $616, %rsp # imm = 0x268
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vf32:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: subq $664, %rsp # imm = 0x298
+; AVX2-FAST-PERLANE-NEXT: subq $696, %rsp # imm = 0x2B8
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm12
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2
@@ -2644,14 +2622,14 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7
; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8
; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
@@ -2660,18 +2638,19 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2685,22 +2664,21 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm15
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm15, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm15
+; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
@@ -2715,7 +2693,7 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2743,137 +2721,136 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm11, %ymm8, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm11, %ymm8, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5,6],ymm13[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm13, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6],ymm11[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm14, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2],ymm7[3,4],ymm14[5],ymm7[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm15[4],ymm12[5],ymm15[5],ymm12[6],ymm15[6],ymm12[7],ymm15[7],ymm12[12],ymm15[12],ymm12[13],ymm15[13],ymm12[14],ymm15[14],ymm12[15],ymm15[15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm5, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm15[4],ymm3[5],ymm15[5],ymm3[6],ymm15[6],ymm3[7],ymm15[7],ymm3[12],ymm15[12],ymm3[13],ymm15[13],ymm3[14],ymm15[14],ymm3[15],ymm15[15]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31>
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm8, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31>
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm11, %ymm15
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm15, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm15, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm13
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm11, %ymm8, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm8, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6],ymm0[7]
-; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0,2,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6],ymm0[7]
-; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,0,2,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,0,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7]
+; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0,2,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm11, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3]
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7]
+; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[0,0,2,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm15, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm12[0],ymm4[1],ymm12[1],ymm4[2],ymm12[2],ymm4[3],ymm12[3],ymm4[8],ymm12[8],ymm4[9],ymm12[9],ymm4[10],ymm12[10],ymm4[11],ymm12[11]
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3],ymm12[4],ymm7[5,6],ymm12[7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm10, %ymm9, %ymm9
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
@@ -2881,30 +2858,32 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 288(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 352(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 320(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 288(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 192(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX2-FAST-PERLANE-NEXT: addq $664, %rsp # imm = 0x298
+; AVX2-FAST-PERLANE-NEXT: addq $696, %rsp # imm = 0x2B8
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
index e1f91214a3ab2..a0c196f33310c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
@@ -180,9 +180,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; SSE-NEXT: movaps (%rsi), %xmm4
; SSE-NEXT: movaps 16(%rsi), %xmm5
; SSE-NEXT: movaps 32(%rsi), %xmm6
-; SSE-NEXT: movaps 48(%rsi), %xmm8
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE-NEXT: movaps 48(%rsi), %xmm7
+; SSE-NEXT: movaps %xmm0, %xmm8
+; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
@@ -191,8 +191,8 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
; SSE-NEXT: movaps %xmm3, %xmm6
-; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
; SSE-NEXT: movaps %xmm3, 96(%rdx)
; SSE-NEXT: movaps %xmm6, 112(%rdx)
; SSE-NEXT: movaps %xmm2, 64(%rdx)
@@ -200,34 +200,34 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; SSE-NEXT: movaps %xmm1, 32(%rdx)
; SSE-NEXT: movaps %xmm4, 48(%rdx)
; SSE-NEXT: movaps %xmm0, (%rdx)
-; SSE-NEXT: movaps %xmm7, 16(%rdx)
+; SSE-NEXT: movaps %xmm8, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i32_stride2_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rsi), %xmm0
-; AVX1-NEXT: vmovaps 16(%rsi), %xmm8
+; AVX1-NEXT: vmovaps 16(%rsi), %xmm1
; AVX1-NEXT: vmovaps 32(%rsi), %xmm2
; AVX1-NEXT: vmovaps 48(%rsi), %xmm3
; AVX1-NEXT: vmovaps (%rdi), %xmm4
; AVX1-NEXT: vmovaps 16(%rdi), %xmm5
; AVX1-NEXT: vmovaps 32(%rdi), %xmm6
; AVX1-NEXT: vmovaps 48(%rdi), %xmm7
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm8 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vmovaps %ymm3, 32(%rdx)
-; AVX1-NEXT: vmovaps %ymm2, 96(%rdx)
-; AVX1-NEXT: vmovaps %ymm1, 64(%rdx)
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps %ymm1, 32(%rdx)
+; AVX1-NEXT: vmovaps %ymm3, 96(%rdx)
+; AVX1-NEXT: vmovaps %ymm2, 64(%rdx)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -286,64 +286,62 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride2_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps 112(%rdi), %xmm4
+; SSE-NEXT: movaps 112(%rdi), %xmm0
; SSE-NEXT: movaps 96(%rdi), %xmm6
-; SSE-NEXT: movaps 80(%rdi), %xmm8
-; SSE-NEXT: movaps 64(%rdi), %xmm9
-; SSE-NEXT: movaps (%rdi), %xmm11
-; SSE-NEXT: movaps 16(%rdi), %xmm14
-; SSE-NEXT: movaps 32(%rdi), %xmm15
+; SSE-NEXT: movaps 80(%rdi), %xmm4
+; SSE-NEXT: movaps 64(%rdi), %xmm3
+; SSE-NEXT: movaps (%rdi), %xmm8
+; SSE-NEXT: movaps 16(%rdi), %xmm1
+; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm5
-; SSE-NEXT: movaps 96(%rsi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 96(%rsi), %xmm11
; SSE-NEXT: movaps 80(%rsi), %xmm12
; SSE-NEXT: movaps 64(%rsi), %xmm13
-; SSE-NEXT: movaps (%rsi), %xmm2
-; SSE-NEXT: movaps 16(%rsi), %xmm1
-; SSE-NEXT: movaps 32(%rsi), %xmm0
-; SSE-NEXT: movaps 48(%rsi), %xmm3
-; SSE-NEXT: movaps %xmm11, %xmm7
-; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; SSE-NEXT: movaps (%rsi), %xmm9
+; SSE-NEXT: movaps 16(%rsi), %xmm10
+; SSE-NEXT: movaps 32(%rsi), %xmm14
+; SSE-NEXT: movaps 48(%rsi), %xmm15
+; SSE-NEXT: movaps %xmm8, %xmm7
+; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
-; SSE-NEXT: movaps %xmm14, %xmm10
-; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
-; SSE-NEXT: movaps %xmm15, %xmm2
-; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: movaps %xmm9, %xmm1
-; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
-; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1]
-; SSE-NEXT: movaps %xmm8, %xmm13
+; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; SSE-NEXT: movaps %xmm1, %xmm9
+; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
+; SSE-NEXT: movaps %xmm2, %xmm10
+; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
+; SSE-NEXT: movaps %xmm5, %xmm14
+; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1]
+; SSE-NEXT: movaps %xmm3, %xmm15
+; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1]
+; SSE-NEXT: movaps %xmm4, %xmm13
; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1]
-; SSE-NEXT: movaps %xmm6, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; SSE-NEXT: movaps 112(%rsi), %xmm12
-; SSE-NEXT: movaps %xmm4, %xmm7
-; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
-; SSE-NEXT: movaps %xmm4, 224(%rdx)
+; SSE-NEXT: movaps %xmm6, %xmm12
+; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
+; SSE-NEXT: movaps 112(%rsi), %xmm11
+; SSE-NEXT: movaps %xmm0, %xmm7
+; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
+; SSE-NEXT: movaps %xmm0, 224(%rdx)
; SSE-NEXT: movaps %xmm7, 240(%rdx)
; SSE-NEXT: movaps %xmm6, 192(%rdx)
-; SSE-NEXT: movaps %xmm3, 208(%rdx)
-; SSE-NEXT: movaps %xmm8, 160(%rdx)
+; SSE-NEXT: movaps %xmm12, 208(%rdx)
+; SSE-NEXT: movaps %xmm4, 160(%rdx)
; SSE-NEXT: movaps %xmm13, 176(%rdx)
-; SSE-NEXT: movaps %xmm9, 128(%rdx)
-; SSE-NEXT: movaps %xmm1, 144(%rdx)
+; SSE-NEXT: movaps %xmm3, 128(%rdx)
+; SSE-NEXT: movaps %xmm15, 144(%rdx)
; SSE-NEXT: movaps %xmm5, 96(%rdx)
-; SSE-NEXT: movaps %xmm0, 112(%rdx)
-; SSE-NEXT: movaps %xmm15, 64(%rdx)
-; SSE-NEXT: movaps %xmm2, 80(%rdx)
-; SSE-NEXT: movaps %xmm14, 32(%rdx)
-; SSE-NEXT: movaps %xmm10, 48(%rdx)
-; SSE-NEXT: movaps %xmm11, (%rdx)
+; SSE-NEXT: movaps %xmm14, 112(%rdx)
+; SSE-NEXT: movaps %xmm2, 64(%rdx)
+; SSE-NEXT: movaps %xmm10, 80(%rdx)
+; SSE-NEXT: movaps %xmm1, 32(%rdx)
+; SSE-NEXT: movaps %xmm9, 48(%rdx)
+; SSE-NEXT: movaps %xmm8, (%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rdx)
; SSE-NEXT: retq
@@ -354,50 +352,50 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX1-NEXT: vmovaps 96(%rdi), %xmm1
; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovaps 64(%rsi), %xmm1
; AVX1-NEXT: vmovaps 64(%rdi), %xmm2
; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm9
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps 80(%rsi), %xmm2
; AVX1-NEXT: vmovaps 80(%rdi), %xmm3
; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm10
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: vmovaps (%rsi), %xmm3
; AVX1-NEXT: vmovaps 16(%rsi), %xmm4
-; AVX1-NEXT: vmovaps 32(%rsi), %xmm11
+; AVX1-NEXT: vmovaps 32(%rsi), %xmm5
; AVX1-NEXT: vmovaps 48(%rsi), %xmm6
; AVX1-NEXT: vmovaps (%rdi), %xmm7
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm2
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm8
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm9
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm10
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm11 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm8 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm11[2],xmm1[3],xmm11[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps 112(%rsi), %xmm4
-; AVX1-NEXT: vmovaps 112(%rdi), %xmm5
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX1-NEXT: vmovaps %ymm4, 224(%rdx)
-; AVX1-NEXT: vmovaps %ymm1, 64(%rdx)
-; AVX1-NEXT: vmovaps %ymm2, 96(%rdx)
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT: vmovaps 112(%rsi), %xmm7
+; AVX1-NEXT: vmovaps 112(%rdi), %xmm8
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
+; AVX1-NEXT: vmovaps %ymm7, 224(%rdx)
+; AVX1-NEXT: vmovaps %ymm5, 64(%rdx)
+; AVX1-NEXT: vmovaps %ymm6, 96(%rdx)
; AVX1-NEXT: vmovaps %ymm3, (%rdx)
-; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
-; AVX1-NEXT: vmovaps %ymm10, 160(%rdx)
-; AVX1-NEXT: vmovaps %ymm9, 128(%rdx)
-; AVX1-NEXT: vmovaps %ymm8, 192(%rdx)
+; AVX1-NEXT: vmovaps %ymm4, 32(%rdx)
+; AVX1-NEXT: vmovaps %ymm2, 160(%rdx)
+; AVX1-NEXT: vmovaps %ymm1, 128(%rdx)
+; AVX1-NEXT: vmovaps %ymm0, 192(%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -407,50 +405,50 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX2-NEXT: vmovaps 64(%rdi), %xmm2
; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vmovaps 80(%rsi), %xmm3
; AVX2-NEXT: vmovaps 80(%rdi), %xmm4
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-NEXT: vmovaps (%rsi), %xmm4
; AVX2-NEXT: vmovaps 16(%rsi), %xmm5
; AVX2-NEXT: vmovaps 32(%rsi), %xmm6
; AVX2-NEXT: vmovaps 48(%rsi), %xmm7
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm2
-; AVX2-NEXT: vmovaps 48(%rdi), %xmm3
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; AVX2-NEXT: vmovaps 112(%rsi), %xmm6
-; AVX2-NEXT: vmovaps 112(%rdi), %xmm1
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; AVX2-NEXT: vmovaps 96(%rsi), %xmm6
-; AVX2-NEXT: vmovaps 96(%rdi), %xmm0
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; AVX2-NEXT: vmovaps %xmm0, 192(%rdx)
-; AVX2-NEXT: vmovaps %xmm1, 208(%rdx)
-; AVX2-NEXT: vmovaps %xmm2, 224(%rdx)
-; AVX2-NEXT: vmovaps %xmm5, 240(%rdx)
-; AVX2-NEXT: vmovaps %xmm3, 64(%rdx)
-; AVX2-NEXT: vmovaps %xmm7, 80(%rdx)
-; AVX2-NEXT: vmovaps %xmm4, 96(%rdx)
+; AVX2-NEXT: vmovaps (%rdi), %xmm8
+; AVX2-NEXT: vmovaps 16(%rdi), %xmm9
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm10
+; AVX2-NEXT: vmovaps 48(%rdi), %xmm11
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
+; AVX2-NEXT: vmovaps 112(%rsi), %xmm10
+; AVX2-NEXT: vmovaps 112(%rdi), %xmm13
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1]
+; AVX2-NEXT: vmovaps 96(%rsi), %xmm13
+; AVX2-NEXT: vmovaps 96(%rdi), %xmm15
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
+; AVX2-NEXT: vmovaps %xmm13, 192(%rdx)
+; AVX2-NEXT: vmovaps %xmm0, 208(%rdx)
+; AVX2-NEXT: vmovaps %xmm10, 224(%rdx)
+; AVX2-NEXT: vmovaps %xmm14, 240(%rdx)
+; AVX2-NEXT: vmovaps %xmm6, 64(%rdx)
+; AVX2-NEXT: vmovaps %xmm11, 80(%rdx)
+; AVX2-NEXT: vmovaps %xmm7, 96(%rdx)
; AVX2-NEXT: vmovaps %xmm8, 112(%rdx)
-; AVX2-NEXT: vmovaps %xmm15, (%rdx)
-; AVX2-NEXT: vmovaps %xmm14, 16(%rdx)
-; AVX2-NEXT: vmovaps %xmm13, 32(%rdx)
+; AVX2-NEXT: vmovaps %xmm4, (%rdx)
+; AVX2-NEXT: vmovaps %xmm9, 16(%rdx)
+; AVX2-NEXT: vmovaps %xmm5, 32(%rdx)
; AVX2-NEXT: vmovaps %xmm12, 48(%rdx)
-; AVX2-NEXT: vmovaps %xmm11, 160(%rdx)
-; AVX2-NEXT: vmovaps %xmm10, 176(%rdx)
-; AVX2-NEXT: vmovaps %xmm9, 128(%rdx)
+; AVX2-NEXT: vmovaps %xmm3, 160(%rdx)
+; AVX2-NEXT: vmovaps %xmm2, 176(%rdx)
+; AVX2-NEXT: vmovaps %xmm1, 128(%rdx)
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vmovaps %xmm0, 144(%rdx)
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index 5f667aa00afd0..cb1fc60f8d1d9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -206,41 +206,41 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride3_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm6
+; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps (%rsi), %xmm3
; SSE-NEXT: movaps 16(%rsi), %xmm5
-; SSE-NEXT: movaps (%rdx), %xmm8
-; SSE-NEXT: movaps 16(%rdx), %xmm9
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
+; SSE-NEXT: movaps (%rdx), %xmm2
+; SSE-NEXT: movaps 16(%rdx), %xmm4
+; SSE-NEXT: movaps %xmm1, %xmm6
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE-NEXT: movaps %xmm1, %xmm7
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE-NEXT: movaps %xmm1, %xmm8
+; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm9[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm9[0,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,0]
-; SSE-NEXT: movaps %xmm6, %xmm4
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSE-NEXT: movaps %xmm6, %xmm7
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm8[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm9[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0]
+; SSE-NEXT: movaps %xmm0, %xmm6
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-NEXT: movaps %xmm0, %xmm7
+; SSE-NEXT: movaps %xmm0, %xmm9
+; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm8[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3]
-; SSE-NEXT: movaps %xmm0, (%rcx)
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE-NEXT: movaps %xmm9, (%rcx)
; SSE-NEXT: movaps %xmm3, 16(%rcx)
-; SSE-NEXT: movaps %xmm2, 48(%rcx)
+; SSE-NEXT: movaps %xmm8, 48(%rcx)
; SSE-NEXT: movaps %xmm5, 64(%rcx)
-; SSE-NEXT: movaps %xmm6, 32(%rcx)
+; SSE-NEXT: movaps %xmm0, 32(%rcx)
; SSE-NEXT: movaps %xmm1, 80(%rcx)
; SSE-NEXT: retq
;
@@ -391,139 +391,140 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride3_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm8
+; SSE-NEXT: movaps (%rdi), %xmm4
; SSE-NEXT: movaps 16(%rdi), %xmm2
-; SSE-NEXT: movaps 32(%rdi), %xmm5
-; SSE-NEXT: movaps 48(%rdi), %xmm15
-; SSE-NEXT: movaps (%rsi), %xmm11
-; SSE-NEXT: movaps 16(%rsi), %xmm12
-; SSE-NEXT: movaps 32(%rsi), %xmm14
-; SSE-NEXT: movaps 48(%rsi), %xmm4
-; SSE-NEXT: movaps 16(%rdx), %xmm10
+; SSE-NEXT: movaps 32(%rdi), %xmm1
+; SSE-NEXT: movaps 48(%rdi), %xmm8
+; SSE-NEXT: movaps (%rsi), %xmm5
+; SSE-NEXT: movaps 16(%rsi), %xmm9
+; SSE-NEXT: movaps 32(%rsi), %xmm10
+; SSE-NEXT: movaps 48(%rsi), %xmm11
+; SSE-NEXT: movaps 16(%rdx), %xmm0
; SSE-NEXT: movaps 32(%rdx), %xmm3
-; SSE-NEXT: movaps 48(%rdx), %xmm1
-; SSE-NEXT: movaps %xmm15, %xmm13
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1]
-; SSE-NEXT: movaps %xmm15, %xmm6
-; SSE-NEXT: movaps %xmm15, %xmm9
-; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm4[3,3]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm13[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[0,3]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[2,0]
-; SSE-NEXT: movaps %xmm5, %xmm6
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm14[1]
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: movaps %xmm5, %xmm13
-; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm14[3,3]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm6[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0]
-; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
-; SSE-NEXT: movaps %xmm2, %xmm6
-; SSE-NEXT: movaps %xmm2, %xmm7
-; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm12[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm10[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm1[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm10[0,3]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,0]
-; SSE-NEXT: movaps %xmm8, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1]
+; SSE-NEXT: movaps 48(%rdx), %xmm7
+; SSE-NEXT: movaps %xmm8, %xmm12
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
+; SSE-NEXT: movaps %xmm8, %xmm13
; SSE-NEXT: movaps %xmm8, %xmm6
; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
-; SSE-NEXT: movaps %xmm8, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm11[3,3]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm7[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0]
+; SSE-NEXT: movaps %xmm1, %xmm12
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1]
+; SSE-NEXT: movaps %xmm1, %xmm13
+; SSE-NEXT: movaps %xmm1, %xmm14
+; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm3[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,0]
+; SSE-NEXT: movaps %xmm2, %xmm12
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1]
+; SSE-NEXT: movaps %xmm2, %xmm13
+; SSE-NEXT: movaps %xmm2, %xmm15
+; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3]
+; SSE-NEXT: movaps %xmm0, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,0]
+; SSE-NEXT: movaps %xmm4, %xmm12
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
+; SSE-NEXT: movaps %xmm4, %xmm13
+; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
+; SSE-NEXT: movaps %xmm4, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3]
; SSE-NEXT: movaps (%rdx), %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm0[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm12[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[0,3]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,0]
-; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3]
-; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm10[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm0[2,3]
-; SSE-NEXT: movaps %xmm6, (%rcx)
-; SSE-NEXT: movaps %xmm11, 16(%rcx)
-; SSE-NEXT: movaps %xmm7, 48(%rcx)
-; SSE-NEXT: movaps %xmm12, 64(%rcx)
-; SSE-NEXT: movaps %xmm13, 96(%rcx)
-; SSE-NEXT: movaps %xmm14, 112(%rcx)
-; SSE-NEXT: movaps %xmm9, 144(%rcx)
-; SSE-NEXT: movaps %xmm4, 160(%rcx)
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3]
-; SSE-NEXT: movaps %xmm8, 32(%rcx)
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0]
+; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = xmm8[1,2],mem[2,3]
+; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm7[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm0[2,3]
+; SSE-NEXT: movaps %xmm13, (%rcx)
+; SSE-NEXT: movaps %xmm5, 16(%rcx)
+; SSE-NEXT: movaps %xmm15, 48(%rcx)
+; SSE-NEXT: movaps %xmm9, 64(%rcx)
+; SSE-NEXT: movaps %xmm14, 96(%rcx)
+; SSE-NEXT: movaps %xmm10, 112(%rcx)
+; SSE-NEXT: movaps %xmm6, 144(%rcx)
+; SSE-NEXT: movaps %xmm11, 160(%rcx)
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3]
+; SSE-NEXT: movaps %xmm4, 32(%rcx)
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3]
; SSE-NEXT: movaps %xmm2, 80(%rcx)
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3]
-; SSE-NEXT: movaps %xmm5, 128(%rcx)
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3]
-; SSE-NEXT: movaps %xmm15, 176(%rcx)
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
+; SSE-NEXT: movaps %xmm1, 128(%rcx)
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3]
+; SSE-NEXT: movaps %xmm8, 176(%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i32_stride3_vf16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rdx), %ymm8
-; AVX1-NEXT: vmovapd 32(%rdx), %ymm9
+; AVX1-NEXT: vmovapd (%rdx), %ymm0
+; AVX1-NEXT: vmovapd 32(%rdx), %ymm1
; AVX1-NEXT: vmovaps (%rsi), %xmm2
; AVX1-NEXT: vmovaps 16(%rsi), %xmm3
-; AVX1-NEXT: vmovapd 32(%rsi), %xmm4
+; AVX1-NEXT: vmovaps 32(%rsi), %xmm4
; AVX1-NEXT: vmovaps 48(%rsi), %xmm5
; AVX1-NEXT: vmovaps (%rdi), %xmm6
; AVX1-NEXT: vmovaps 16(%rdi), %xmm7
-; AVX1-NEXT: vmovapd 32(%rdi), %xmm0
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm2[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2]
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm8
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm6[1],xmm2[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm2[1,1],xmm9[0,2]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vbroadcastsd (%rdx), %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3],xmm5[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3,2,3]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm4[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[1,1],xmm5[0,2]
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,0],xmm0[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vbroadcastsd 32(%rdx), %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm7[3,3],xmm3[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm3[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3,2,3]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = mem[1,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm9[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = mem[1,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm8[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
-; AVX1-NEXT: vmovaps %ymm5, 32(%rcx)
-; AVX1-NEXT: vmovaps %ymm4, 128(%rcx)
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastsd (%rdx), %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7]
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm6
+; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm6[3,3],xmm5[3,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1],xmm6[0,2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT: vbroadcastsd 32(%rdx), %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
+; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm3[3,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
+; AVX1-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX1-NEXT: vmovaps %ymm1, 128(%rcx)
; AVX1-NEXT: vmovaps %ymm3, 64(%rcx)
-; AVX1-NEXT: vmovaps %ymm0, 96(%rcx)
-; AVX1-NEXT: vmovaps %ymm1, 160(%rcx)
+; AVX1-NEXT: vmovaps %ymm4, 96(%rcx)
+; AVX1-NEXT: vmovaps %ymm5, 160(%rcx)
; AVX1-NEXT: vmovaps %ymm2, (%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -719,99 +720,99 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride3_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $168, %rsp
+; SSE-NEXT: subq $152, %rsp
; SSE-NEXT: movaps (%rdi), %xmm1
-; SSE-NEXT: movaps 16(%rdi), %xmm10
-; SSE-NEXT: movaps 32(%rdi), %xmm9
-; SSE-NEXT: movaps 48(%rdi), %xmm8
-; SSE-NEXT: movaps (%rsi), %xmm4
-; SSE-NEXT: movaps 16(%rsi), %xmm5
-; SSE-NEXT: movaps 32(%rsi), %xmm14
-; SSE-NEXT: movaps 48(%rsi), %xmm13
-; SSE-NEXT: movaps (%rdx), %xmm6
-; SSE-NEXT: movaps 16(%rdx), %xmm7
-; SSE-NEXT: movaps 32(%rdx), %xmm3
-; SSE-NEXT: movaps 48(%rdx), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm2
+; SSE-NEXT: movaps 32(%rdi), %xmm3
+; SSE-NEXT: movaps 48(%rdi), %xmm4
+; SSE-NEXT: movaps (%rsi), %xmm12
+; SSE-NEXT: movaps 16(%rsi), %xmm11
+; SSE-NEXT: movaps 32(%rsi), %xmm10
+; SSE-NEXT: movaps 48(%rsi), %xmm9
+; SSE-NEXT: movaps (%rdx), %xmm5
+; SSE-NEXT: movaps 16(%rdx), %xmm6
+; SSE-NEXT: movaps 32(%rdx), %xmm7
+; SSE-NEXT: movaps 48(%rdx), %xmm8
; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3]
-; SSE-NEXT: movaps %xmm6, %xmm11
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm1, %xmm6
-; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm1, %xmm6
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3]
+; SSE-NEXT: movaps %xmm5, %xmm14
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm13
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3]
; SSE-NEXT: movaps %xmm1, %xmm15
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm11[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[0,2]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3]
-; SSE-NEXT: movaps %xmm10, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm14[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2]
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm5[3,3]
-; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm0
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3]
-; SSE-NEXT: movaps %xmm9, %xmm4
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm14[3,3]
-; SSE-NEXT: movaps %xmm9, (%rsp) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2]
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm8, %xmm0
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3]
-; SSE-NEXT: movaps %xmm8, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm6[1,1]
+; SSE-NEXT: movaps %xmm6, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm0
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3]
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm10[3,3]
+; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, %xmm0
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3]
+; SSE-NEXT: movaps %xmm4, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm13[3,3]
-; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm2[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2]
-; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 64(%rdi), %xmm13
+; SSE-NEXT: movaps %xmm4, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2]
+; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 64(%rdi), %xmm9
; SSE-NEXT: movaps 64(%rdx), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm13, %xmm0
+; SSE-NEXT: movaps %xmm9, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
-; SSE-NEXT: movaps 64(%rsi), %xmm11
-; SSE-NEXT: movaps %xmm13, %xmm12
-; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm13, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm11[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2]
+; SSE-NEXT: movaps 64(%rsi), %xmm12
+; SSE-NEXT: movaps %xmm9, %xmm13
+; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0]
+; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm12[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2]
; SSE-NEXT: movaps 80(%rdi), %xmm2
; SSE-NEXT: movaps 80(%rdx), %xmm3
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3]
; SSE-NEXT: movaps 80(%rsi), %xmm8
-; SSE-NEXT: movaps %xmm2, %xmm10
-; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0]
+; SSE-NEXT: movaps %xmm2, %xmm11
+; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3]
@@ -832,9 +833,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm3[0,2]
; SSE-NEXT: movaps 112(%rdi), %xmm3
-; SSE-NEXT: movaps 112(%rdx), %xmm9
+; SSE-NEXT: movaps 112(%rdx), %xmm10
; SSE-NEXT: movaps %xmm3, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm9[0,3]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[0,3]
; SSE-NEXT: movaps 112(%rsi), %xmm1
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
@@ -842,14 +843,13 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps %xmm3, %xmm5
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm10[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
; SSE-NEXT: movaps %xmm15, %xmm5
; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[2,3]
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload
; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
@@ -858,21 +858,21 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3]
+; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3]
; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3]
; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[1,2],mem[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm9[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm10[2,3]
; SSE-NEXT: movaps %xmm1, 352(%rcx)
; SSE-NEXT: movaps %xmm4, 336(%rcx)
; SSE-NEXT: movaps %xmm6, 304(%rcx)
; SSE-NEXT: movaps %xmm7, 288(%rcx)
; SSE-NEXT: movaps %xmm8, 256(%rcx)
-; SSE-NEXT: movaps %xmm10, 240(%rcx)
-; SSE-NEXT: movaps %xmm11, 208(%rcx)
-; SSE-NEXT: movaps %xmm12, 192(%rcx)
+; SSE-NEXT: movaps %xmm11, 240(%rcx)
+; SSE-NEXT: movaps %xmm12, 208(%rcx)
+; SSE-NEXT: movaps %xmm13, 192(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 160(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -895,8 +895,8 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps %xmm0, 320(%rcx)
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3]
; SSE-NEXT: movaps %xmm2, 272(%rcx)
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3]
-; SSE-NEXT: movaps %xmm13, 224(%rcx)
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3]
+; SSE-NEXT: movaps %xmm9, 224(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
; SSE-NEXT: movaps %xmm0, 176(%rcx)
@@ -909,119 +909,119 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps %xmm5, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,3]
; SSE-NEXT: movaps %xmm0, 32(%rcx)
-; SSE-NEXT: addq $168, %rsp
+; SSE-NEXT: addq $152, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i32_stride3_vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rdx), %ymm14
-; AVX1-NEXT: vmovapd 32(%rdx), %ymm8
+; AVX1-NEXT: vmovapd (%rdx), %ymm0
+; AVX1-NEXT: vmovapd 32(%rdx), %ymm3
; AVX1-NEXT: vmovapd 64(%rdx), %ymm5
-; AVX1-NEXT: vmovapd 96(%rdx), %ymm15
-; AVX1-NEXT: vmovaps (%rsi), %xmm2
+; AVX1-NEXT: vmovapd 96(%rdx), %ymm2
+; AVX1-NEXT: vmovaps (%rsi), %xmm1
; AVX1-NEXT: vmovaps 16(%rsi), %xmm7
; AVX1-NEXT: vmovaps 32(%rsi), %xmm10
-; AVX1-NEXT: vmovapd 48(%rsi), %xmm0
+; AVX1-NEXT: vmovaps 48(%rsi), %xmm9
; AVX1-NEXT: vmovaps (%rdi), %xmm4
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm1
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm8
; AVX1-NEXT: vmovaps 32(%rdi), %xmm11
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm2[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1],xmm3[0,2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vbroadcastsd (%rdx), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX1-NEXT: vmovaps 80(%rsi), %xmm3
-; AVX1-NEXT: vmovaps 80(%rdi), %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm3[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,2,3]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX1-NEXT: vmovaps 64(%rsi), %xmm3
-; AVX1-NEXT: vmovaps 64(%rdi), %xmm4
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm3[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm3[1,1],xmm6[0,2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX1-NEXT: vbroadcastsd 64(%rdx), %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm0[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm8[2,3,2,3]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1],xmm0[0,2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm11[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm11[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vbroadcastsd 32(%rdx), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
-; AVX1-NEXT: vmovaps 112(%rsi), %xmm0
-; AVX1-NEXT: vmovaps 112(%rdi), %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm0[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm15[2,3,2,3]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
-; AVX1-NEXT: vmovaps 96(%rsi), %xmm0
-; AVX1-NEXT: vmovaps 96(%rdi), %xmm3
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm0[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1],xmm4[0,2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vbroadcastsd 96(%rdx), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3],xmm7[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3,2,3]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm5[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = mem[1,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm8[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = mem[1,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm15[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm7 = ymm14[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
-; AVX1-NEXT: vmovaps %ymm6, 32(%rcx)
-; AVX1-NEXT: vmovaps %ymm5, 320(%rcx)
-; AVX1-NEXT: vmovaps %ymm4, 128(%rcx)
-; AVX1-NEXT: vmovaps %ymm3, 224(%rcx)
-; AVX1-NEXT: vmovaps %ymm1, 64(%rcx)
-; AVX1-NEXT: vmovaps %ymm0, 288(%rcx)
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm1[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1],xmm6[0,2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastsd (%rdx), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
+; AVX1-NEXT: vmovaps 80(%rsi), %xmm4
+; AVX1-NEXT: vmovaps 80(%rdi), %xmm6
+; AVX1-NEXT: vshufps {{.*#+}} xmm12 = xmm6[3,3],xmm4[3,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[0,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6],ymm6[7]
+; AVX1-NEXT: vmovaps 64(%rsi), %xmm6
+; AVX1-NEXT: vmovaps 64(%rdi), %xmm12
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm6[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm6[1,1],xmm13[0,2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm12[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6
+; AVX1-NEXT: vbroadcastsd 64(%rdx), %ymm12
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7]
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm12
+; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm12[3,3],xmm9[3,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,1],xmm12[0,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm12 = ymm12[0,0,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm10[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm12 = xmm10[1,1],xmm12[0,2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
+; AVX1-NEXT: vbroadcastsd 32(%rdx), %ymm11
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
+; AVX1-NEXT: vmovaps 112(%rsi), %xmm11
+; AVX1-NEXT: vmovaps 112(%rdi), %xmm12
+; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm12[3,3],xmm11[3,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1],xmm12[0,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm12 = ymm12[0,0,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7]
+; AVX1-NEXT: vmovaps 96(%rsi), %xmm12
+; AVX1-NEXT: vmovaps 96(%rdi), %xmm13
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm13[1],xmm12[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm14 = xmm12[1,1],xmm14[0,2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12
+; AVX1-NEXT: vbroadcastsd 96(%rdx), %ymm13
+; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7]
+; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm8[3,3],xmm7[3,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm8 = ymm8[0,0,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7]
+; AVX1-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX1-NEXT: vmovaps %ymm2, 320(%rcx)
+; AVX1-NEXT: vmovaps %ymm3, 128(%rcx)
+; AVX1-NEXT: vmovaps %ymm5, 224(%rcx)
+; AVX1-NEXT: vmovaps %ymm7, 64(%rcx)
+; AVX1-NEXT: vmovaps %ymm12, 288(%rcx)
; AVX1-NEXT: vmovaps %ymm11, 352(%rcx)
; AVX1-NEXT: vmovaps %ymm10, 96(%rcx)
; AVX1-NEXT: vmovaps %ymm9, 160(%rcx)
-; AVX1-NEXT: vmovaps %ymm13, 192(%rcx)
-; AVX1-NEXT: vmovaps %ymm12, 256(%rcx)
-; AVX1-NEXT: vmovaps %ymm2, (%rcx)
+; AVX1-NEXT: vmovaps %ymm6, 192(%rcx)
+; AVX1-NEXT: vmovaps %ymm4, 256(%rcx)
+; AVX1-NEXT: vmovaps %ymm1, (%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
index 6edf8e0d9835f..4f69f60eb560e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
@@ -221,97 +221,97 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps (%rsi), %xmm5
-; SSE-NEXT: movaps 16(%rsi), %xmm8
-; SSE-NEXT: movaps (%rdx), %xmm3
+; SSE-NEXT: movaps 16(%rsi), %xmm6
+; SSE-NEXT: movaps (%rdx), %xmm7
; SSE-NEXT: movaps 16(%rdx), %xmm4
-; SSE-NEXT: movaps (%rcx), %xmm6
+; SSE-NEXT: movaps (%rcx), %xmm8
; SSE-NEXT: movaps 16(%rcx), %xmm9
-; SSE-NEXT: movaps %xmm3, %xmm7
-; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; SSE-NEXT: movaps %xmm7, %xmm10
+; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE-NEXT: movaps %xmm2, %xmm10
-; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0]
-; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0]
+; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSE-NEXT: movaps %xmm0, %xmm5
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE-NEXT: movaps %xmm4, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
-; SSE-NEXT: movaps %xmm1, %xmm6
-; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
-; SSE-NEXT: movaps %xmm6, %xmm7
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; SSE-NEXT: movaps %xmm4, %xmm7
+; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; SSE-NEXT: movaps %xmm1, %xmm8
+; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSE-NEXT: movaps %xmm8, %xmm10
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0]
; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE-NEXT: movaps %xmm1, %xmm6
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
; SSE-NEXT: movaps %xmm1, 96(%r8)
-; SSE-NEXT: movaps %xmm3, 112(%r8)
-; SSE-NEXT: movaps %xmm6, 64(%r8)
-; SSE-NEXT: movaps %xmm7, 80(%r8)
+; SSE-NEXT: movaps %xmm6, 112(%r8)
+; SSE-NEXT: movaps %xmm8, 64(%r8)
+; SSE-NEXT: movaps %xmm10, 80(%r8)
; SSE-NEXT: movaps %xmm0, 32(%r8)
; SSE-NEXT: movaps %xmm5, 48(%r8)
; SSE-NEXT: movaps %xmm2, (%r8)
-; SSE-NEXT: movaps %xmm10, 16(%r8)
+; SSE-NEXT: movaps %xmm3, 16(%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i32_stride4_vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %xmm2
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm10
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
; AVX1-NEXT: vmovaps (%rsi), %xmm4
; AVX1-NEXT: vmovaps 16(%rsi), %xmm1
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm10[1],xmm1[1],zero,zero
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm8
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[1],xmm1[1],zero,zero
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vmovaps (%rcx), %xmm5
; AVX1-NEXT: vmovaps 16(%rcx), %xmm6
; AVX1-NEXT: vmovaps (%rdx), %xmm7
-; AVX1-NEXT: vmovaps 16(%rdx), %xmm3
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm3[0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm4[1],zero,zero
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm9
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm7[0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX1-NEXT: vmovaps 16(%rdx), %xmm8
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm8[0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm9 = xmm2[1],xmm4[1],zero,zero
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm11 = xmm5[0],xmm7[0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm7[2],xmm5[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,0],xmm2[3,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm6[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,0],xmm10[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
-; AVX1-NEXT: vmovaps %ymm1, 96(%r8)
-; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm8[2],xmm6[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
+; AVX1-NEXT: vmovaps %ymm0, 96(%r8)
+; AVX1-NEXT: vmovaps %ymm2, 32(%r8)
; AVX1-NEXT: vmovaps %ymm9, (%r8)
-; AVX1-NEXT: vmovaps %ymm8, 64(%r8)
+; AVX1-NEXT: vmovaps %ymm3, 64(%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_i32_stride4_vf8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps (%rdi), %ymm8
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rsi), %ymm1
; AVX2-NEXT: vmovaps (%rdx), %ymm2
; AVX2-NEXT: vmovaps (%rcx), %ymm3
@@ -320,28 +320,28 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
; AVX2-NEXT: vmovaps (%rsi), %xmm7
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; AVX2-NEXT: vmovaps (%rdi), %xmm8
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[4],ymm1[4],ymm8[5],ymm1[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-NEXT: vmovaps %ymm1, 96(%r8)
-; AVX2-NEXT: vmovaps %ymm4, 64(%r8)
-; AVX2-NEXT: vmovaps %ymm0, (%r8)
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-NEXT: vmovaps %ymm0, 96(%r8)
+; AVX2-NEXT: vmovaps %ymm5, 64(%r8)
+; AVX2-NEXT: vmovaps %ymm4, (%r8)
; AVX2-NEXT: vmovaps %ymm6, 32(%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -378,90 +378,90 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride4_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm10
-; SSE-NEXT: movaps 16(%rdi), %xmm13
-; SSE-NEXT: movaps 32(%rdi), %xmm8
-; SSE-NEXT: movaps 48(%rdi), %xmm4
-; SSE-NEXT: movaps (%rsi), %xmm3
-; SSE-NEXT: movaps 16(%rsi), %xmm1
+; SSE-NEXT: movaps (%rdi), %xmm5
+; SSE-NEXT: movaps 16(%rdi), %xmm11
+; SSE-NEXT: movaps 32(%rdi), %xmm4
+; SSE-NEXT: movaps 48(%rdi), %xmm2
+; SSE-NEXT: movaps (%rsi), %xmm0
+; SSE-NEXT: movaps 16(%rsi), %xmm3
; SSE-NEXT: movaps 32(%rsi), %xmm9
-; SSE-NEXT: movaps (%rdx), %xmm0
-; SSE-NEXT: movaps 16(%rdx), %xmm5
-; SSE-NEXT: movaps 32(%rdx), %xmm6
-; SSE-NEXT: movaps (%rcx), %xmm11
+; SSE-NEXT: movaps (%rdx), %xmm7
+; SSE-NEXT: movaps 16(%rdx), %xmm13
+; SSE-NEXT: movaps 32(%rdx), %xmm10
+; SSE-NEXT: movaps (%rcx), %xmm8
; SSE-NEXT: movaps 16(%rcx), %xmm14
; SSE-NEXT: movaps 32(%rcx), %xmm12
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
-; SSE-NEXT: movaps %xmm10, %xmm15
-; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
-; SSE-NEXT: movaps %xmm15, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3]
-; SSE-NEXT: movaps %xmm10, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
+; SSE-NEXT: movaps %xmm7, %xmm15
+; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; SSE-NEXT: movaps %xmm6, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0]
+; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
-; SSE-NEXT: movaps %xmm13, %xmm7
-; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; SSE-NEXT: movaps %xmm7, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
-; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; SSE-NEXT: movaps %xmm13, %xmm11
-; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm5[0]
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
-; SSE-NEXT: movaps %xmm8, %xmm5
-; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
-; SSE-NEXT: movaps %xmm5, %xmm14
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: movaps 48(%rdx), %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; SSE-NEXT: movaps %xmm13, %xmm15
+; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; SSE-NEXT: movaps %xmm11, %xmm7
+; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; SSE-NEXT: movaps %xmm7, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1]
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm15[0]
+; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; SSE-NEXT: movaps %xmm11, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0]
+; SSE-NEXT: movaps %xmm10, %xmm15
+; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
+; SSE-NEXT: movaps %xmm4, %xmm13
+; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; SSE-NEXT: movaps %xmm13, %xmm14
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0]
+; SSE-NEXT: movaps 48(%rdx), %xmm15
+; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3]
; SSE-NEXT: movaps 48(%rcx), %xmm12
-; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; SSE-NEXT: movaps %xmm8, %xmm9
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm6[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
-; SSE-NEXT: movaps %xmm0, %xmm6
-; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; SSE-NEXT: movaps 48(%rsi), %xmm2
-; SSE-NEXT: movaps %xmm4, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: movaps %xmm3, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE-NEXT: movaps %xmm4, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; SSE-NEXT: movaps %xmm4, 224(%r8)
-; SSE-NEXT: movaps %xmm2, 240(%r8)
+; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE-NEXT: movaps %xmm4, %xmm9
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0]
+; SSE-NEXT: movaps %xmm15, %xmm10
+; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
+; SSE-NEXT: movaps 48(%rsi), %xmm1
+; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movaps %xmm3, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0]
+; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0]
+; SSE-NEXT: movaps %xmm2, 224(%r8)
+; SSE-NEXT: movaps %xmm1, 240(%r8)
; SSE-NEXT: movaps %xmm3, 192(%r8)
-; SSE-NEXT: movaps %xmm1, 208(%r8)
-; SSE-NEXT: movaps %xmm8, 160(%r8)
+; SSE-NEXT: movaps %xmm0, 208(%r8)
+; SSE-NEXT: movaps %xmm4, 160(%r8)
; SSE-NEXT: movaps %xmm9, 176(%r8)
-; SSE-NEXT: movaps %xmm5, 128(%r8)
+; SSE-NEXT: movaps %xmm13, 128(%r8)
; SSE-NEXT: movaps %xmm14, 144(%r8)
-; SSE-NEXT: movaps %xmm13, 96(%r8)
-; SSE-NEXT: movaps %xmm11, 112(%r8)
+; SSE-NEXT: movaps %xmm11, 96(%r8)
+; SSE-NEXT: movaps %xmm8, 112(%r8)
; SSE-NEXT: movaps %xmm7, 64(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%r8)
-; SSE-NEXT: movaps %xmm10, 32(%r8)
+; SSE-NEXT: movaps %xmm5, 32(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%r8)
-; SSE-NEXT: movaps %xmm15, (%r8)
+; SSE-NEXT: movaps %xmm6, (%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%r8)
; SSE-NEXT: retq
@@ -469,100 +469,100 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-LABEL: store_i32_stride4_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $24, %rsp
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm7
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm6
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm11
-; AVX1-NEXT: vmovaps 16(%rsi), %xmm13
-; AVX1-NEXT: vmovaps 32(%rsi), %xmm14
-; AVX1-NEXT: vmovaps 48(%rsi), %xmm10
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[1],xmm10[1],zero,zero
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
-; AVX1-NEXT: vmovaps 16(%rcx), %xmm12
-; AVX1-NEXT: vmovaps 32(%rcx), %xmm3
-; AVX1-NEXT: vmovaps 48(%rcx), %xmm2
-; AVX1-NEXT: vmovaps 16(%rdx), %xmm15
-; AVX1-NEXT: vmovaps 32(%rdx), %xmm1
-; AVX1-NEXT: vmovaps 48(%rdx), %xmm4
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm4[0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[1],xmm14[1],zero,zero
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm8
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm3[0],xmm1[0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm7[1],xmm13[1],zero,zero
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm13[0],xmm7[1],xmm13[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm12[0],xmm15[0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7]
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm2
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm7
+; AVX1-NEXT: vmovaps 16(%rsi), %xmm1
+; AVX1-NEXT: vmovaps 32(%rsi), %xmm4
+; AVX1-NEXT: vmovaps 48(%rsi), %xmm9
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm7[1],xmm9[1],zero,zero
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT: vmovaps 16(%rcx), %xmm5
+; AVX1-NEXT: vmovaps 32(%rcx), %xmm10
+; AVX1-NEXT: vmovaps 48(%rcx), %xmm13
+; AVX1-NEXT: vmovaps 16(%rdx), %xmm6
+; AVX1-NEXT: vmovaps 32(%rdx), %xmm11
+; AVX1-NEXT: vmovaps 48(%rdx), %xmm14
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm13[0],xmm14[0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[0,1,2,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7]
+; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm2[1],xmm4[1],zero,zero
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm10[0],xmm11[0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[0,1,2,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7]
+; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[1],xmm1[1],zero,zero
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm5[0],xmm6[0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[0,1,2,0]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps (%rdi), %xmm8
-; AVX1-NEXT: vmovaps (%rsi), %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm8[1],xmm7[1],zero,zero
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm9
-; AVX1-NEXT: vmovaps (%rcx), %xmm6
+; AVX1-NEXT: vmovaps (%rdi), %xmm3
+; AVX1-NEXT: vmovaps (%rsi), %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = xmm3[1],xmm1[1],zero,zero
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm8, %ymm8
+; AVX1-NEXT: vmovaps (%rcx), %xmm15
; AVX1-NEXT: vmovaps (%rdx), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm0[0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm6[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,0],xmm8[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm4[2],xmm2[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm10[3,0],xmm11[3,0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm0[0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[0,1,2,0]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm15[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm14[2],xmm13[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm9[3,0],xmm7[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm11[2],xmm10[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm7[3,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm1[2],xmm3[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm14[3,0],xmm3[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm15[2],xmm12[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm6[2],xmm5[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm5[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm6[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX1-NEXT: vmovaps %ymm2, 96(%r8)
; AVX1-NEXT: vmovaps %ymm1, 160(%r8)
; AVX1-NEXT: vmovaps %ymm0, 224(%r8)
-; AVX1-NEXT: vmovaps %ymm8, 32(%r8)
-; AVX1-NEXT: vmovaps %ymm9, (%r8)
+; AVX1-NEXT: vmovaps %ymm3, 32(%r8)
+; AVX1-NEXT: vmovaps %ymm8, (%r8)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 64(%r8)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -575,69 +575,69 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX2-LABEL: store_i32_stride4_vf16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps (%rdi), %ymm9
-; AVX2-NEXT: vmovaps (%rcx), %xmm10
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps (%rcx), %xmm4
; AVX2-NEXT: vmovaps 32(%rcx), %xmm3
; AVX2-NEXT: vmovaps (%rdx), %xmm5
; AVX2-NEXT: vmovaps 32(%rdx), %xmm6
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-NEXT: vmovaps (%rsi), %xmm7
-; AVX2-NEXT: vmovaps 32(%rsi), %xmm2
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-NEXT: vmovaps 32(%rsi), %xmm8
+; AVX2-NEXT: vmovaps (%rdi), %xmm9
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm10
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
; AVX2-NEXT: vmovaps 32(%rdi), %ymm11
; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX2-NEXT: vmovaps (%rsi), %ymm6
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX2-NEXT: vmovaps 32(%rsi), %ymm4
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; AVX2-NEXT: vmovaps 32(%rsi), %ymm10
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
-; AVX2-NEXT: vmovaps 32(%rdx), %ymm2
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
-; AVX2-NEXT: vmovaps 32(%rcx), %ymm10
-; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[4],ymm10[4],ymm2[5],ymm10[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7]
+; AVX2-NEXT: vmovaps 32(%rdx), %ymm8
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-NEXT: vmovaps 32(%rcx), %ymm5
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
+; AVX2-NEXT: vmovaps (%rdx), %ymm9
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7]
+; AVX2-NEXT: vmovaps (%rcx), %ymm8
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
-; AVX2-NEXT: vmovaps (%rdx), %ymm7
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7]
-; AVX2-NEXT: vmovaps (%rcx), %ymm10
-; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[4],ymm10[4],ymm7[5],ymm10[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[6],ymm10[6],ymm7[7],ymm10[7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
-; AVX2-NEXT: vmovaps %ymm6, 96(%r8)
-; AVX2-NEXT: vmovaps %ymm4, 64(%r8)
-; AVX2-NEXT: vmovaps %ymm2, 224(%r8)
-; AVX2-NEXT: vmovaps %ymm5, 192(%r8)
-; AVX2-NEXT: vmovaps %ymm0, (%r8)
+; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm8[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7]
+; AVX2-NEXT: vmovaps %ymm0, 96(%r8)
+; AVX2-NEXT: vmovaps %ymm10, 64(%r8)
+; AVX2-NEXT: vmovaps %ymm5, 224(%r8)
+; AVX2-NEXT: vmovaps %ymm7, 192(%r8)
+; AVX2-NEXT: vmovaps %ymm4, (%r8)
; AVX2-NEXT: vmovaps %ymm3, 160(%r8)
-; AVX2-NEXT: vmovaps %ymm8, 128(%r8)
+; AVX2-NEXT: vmovaps %ymm2, 128(%r8)
; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -695,133 +695,133 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-LABEL: store_i32_stride4_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $184, %rsp
-; SSE-NEXT: movaps (%rdi), %xmm14
+; SSE-NEXT: movaps (%rdi), %xmm10
; SSE-NEXT: movaps 16(%rdi), %xmm11
; SSE-NEXT: movaps 32(%rdi), %xmm12
; SSE-NEXT: movaps 48(%rdi), %xmm13
-; SSE-NEXT: movaps (%rsi), %xmm15
-; SSE-NEXT: movaps 16(%rsi), %xmm9
-; SSE-NEXT: movaps 32(%rsi), %xmm8
-; SSE-NEXT: movaps (%rdx), %xmm2
+; SSE-NEXT: movaps (%rsi), %xmm5
+; SSE-NEXT: movaps 16(%rsi), %xmm2
+; SSE-NEXT: movaps 32(%rsi), %xmm0
+; SSE-NEXT: movaps (%rdx), %xmm6
; SSE-NEXT: movaps 16(%rdx), %xmm4
; SSE-NEXT: movaps 32(%rdx), %xmm1
; SSE-NEXT: movaps (%rcx), %xmm7
-; SSE-NEXT: movaps 16(%rcx), %xmm0
-; SSE-NEXT: movaps 32(%rcx), %xmm10
-; SSE-NEXT: movaps %xmm2, %xmm6
-; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; SSE-NEXT: movaps %xmm14, %xmm5
-; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1]
-; SSE-NEXT: movaps %xmm5, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; SSE-NEXT: movaps %xmm14, %xmm5
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1]
+; SSE-NEXT: movaps 16(%rcx), %xmm8
+; SSE-NEXT: movaps 32(%rcx), %xmm3
+; SSE-NEXT: movaps %xmm6, %xmm9
+; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; SSE-NEXT: movaps %xmm10, %xmm14
+; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
+; SSE-NEXT: movaps %xmm14, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0]
+; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm4, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movaps %xmm11, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
-; SSE-NEXT: movaps %xmm3, %xmm5
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0]
+; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; SSE-NEXT: movaps %xmm10, %xmm5
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1]
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE-NEXT: movaps %xmm11, %xmm6
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
+; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
+; SSE-NEXT: movaps %xmm11, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
-; SSE-NEXT: movaps %xmm12, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 48(%rdx), %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3]
-; SSE-NEXT: movaps 48(%rcx), %xmm2
-; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm8[2],xmm12[3],xmm8[3]
-; SSE-NEXT: movaps %xmm12, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movaps %xmm12, %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movaps %xmm4, %xmm5
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0]
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 48(%rdx), %xmm2
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: movaps 48(%rcx), %xmm3
+; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; SSE-NEXT: movaps %xmm12, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movaps 48(%rsi), %xmm3
-; SSE-NEXT: movaps %xmm13, %xmm15
-; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
-; SSE-NEXT: movaps %xmm15, %xmm4
-; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: movaps 48(%rsi), %xmm1
+; SSE-NEXT: movaps %xmm13, %xmm14
+; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
+; SSE-NEXT: movaps %xmm14, %xmm4
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1]
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3]
-; SSE-NEXT: movaps %xmm13, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
+; SSE-NEXT: movaps %xmm13, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdx), %xmm0
; SSE-NEXT: movaps 64(%rcx), %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: movaps 64(%rdi), %xmm10
-; SSE-NEXT: movaps 64(%rsi), %xmm4
+; SSE-NEXT: movaps 64(%rsi), %xmm5
; SSE-NEXT: movaps %xmm10, %xmm12
-; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1]
; SSE-NEXT: movaps %xmm12, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; SSE-NEXT: movaps %xmm10, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
; SSE-NEXT: movaps 80(%rdx), %xmm0
; SSE-NEXT: movaps 80(%rcx), %xmm3
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movaps %xmm0, %xmm6
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE-NEXT: movaps 80(%rdi), %xmm5
-; SSE-NEXT: movaps 80(%rsi), %xmm6
-; SSE-NEXT: movaps %xmm5, %xmm14
-; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
-; SSE-NEXT: movaps %xmm14, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE-NEXT: movaps 80(%rsi), %xmm7
+; SSE-NEXT: movaps %xmm5, %xmm15
+; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
+; SSE-NEXT: movaps %xmm15, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE-NEXT: movaps %xmm5, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE-NEXT: movaps 96(%rdx), %xmm1
-; SSE-NEXT: movaps 96(%rcx), %xmm6
+; SSE-NEXT: movaps 96(%rcx), %xmm4
; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: movaps 96(%rdi), %xmm3
; SSE-NEXT: movaps 96(%rsi), %xmm7
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; SSE-NEXT: movaps %xmm4, %xmm13
+; SSE-NEXT: movaps %xmm3, %xmm6
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; SSE-NEXT: movaps %xmm6, %xmm13
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
; SSE-NEXT: movaps %xmm3, %xmm11
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0]
@@ -834,8 +834,8 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps 112(%rsi), %xmm8
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
-; SSE-NEXT: movaps %xmm1, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
@@ -845,15 +845,15 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps %xmm0, 496(%r8)
; SSE-NEXT: movaps %xmm7, 480(%r8)
; SSE-NEXT: movaps %xmm1, 464(%r8)
-; SSE-NEXT: movaps %xmm6, 448(%r8)
+; SSE-NEXT: movaps %xmm4, 448(%r8)
; SSE-NEXT: movaps %xmm3, 432(%r8)
; SSE-NEXT: movaps %xmm11, 416(%r8)
-; SSE-NEXT: movaps %xmm4, 400(%r8)
+; SSE-NEXT: movaps %xmm6, 400(%r8)
; SSE-NEXT: movaps %xmm13, 384(%r8)
; SSE-NEXT: movaps %xmm5, 368(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 352(%r8)
-; SSE-NEXT: movaps %xmm14, 336(%r8)
+; SSE-NEXT: movaps %xmm15, 336(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 320(%r8)
; SSE-NEXT: movaps %xmm10, 304(%r8)
@@ -866,7 +866,7 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps %xmm0, 240(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 224(%r8)
-; SSE-NEXT: movaps %xmm15, 208(%r8)
+; SSE-NEXT: movaps %xmm14, 208(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -923,12 +923,12 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovaps 64(%rcx), %xmm3
-; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 64(%rdx), %xmm2
+; AVX1-NEXT: vmovaps 64(%rcx), %xmm2
; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vmovaps 64(%rdx), %xmm3
+; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
@@ -942,10 +942,10 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovaps 80(%rcx), %xmm2
; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 80(%rdx), %xmm7
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm7[0]
+; AVX1-NEXT: vmovaps 80(%rdx), %xmm12
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -956,41 +956,41 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovaps 32(%rcx), %xmm13
-; AVX1-NEXT: vmovaps 32(%rdx), %xmm10
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm10[0]
+; AVX1-NEXT: vmovaps 32(%rcx), %xmm10
+; AVX1-NEXT: vmovaps 32(%rdx), %xmm8
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 48(%rdi), %xmm2
-; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps 48(%rsi), %xmm1
; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovaps 48(%rcx), %xmm11
-; AVX1-NEXT: vmovaps 48(%rdx), %xmm9
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm9[0]
+; AVX1-NEXT: vmovaps 48(%rcx), %xmm9
+; AVX1-NEXT: vmovaps 48(%rdx), %xmm7
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm7[0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 96(%rdi), %xmm2
; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 96(%rsi), %xmm1
-; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovaps 96(%rcx), %xmm12
+; AVX1-NEXT: vmovaps 96(%rcx), %xmm6
; AVX1-NEXT: vmovaps 96(%rdx), %xmm5
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm5[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm5[0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1005,120 +1005,118 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-NEXT: vmovaps 112(%rdx), %xmm3
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps (%rdi), %xmm2
-; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps (%rsi), %xmm1
+; AVX1-NEXT: vmovaps (%rdi), %xmm1
; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vmovaps (%rsi), %xmm13
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm13[1],zero,zero
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovaps (%rcx), %xmm8
+; AVX1-NEXT: vmovaps (%rcx), %xmm11
; AVX1-NEXT: vmovaps (%rdx), %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm8[0],xmm2[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm2[0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,0]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
-; AVX1-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7]
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm6[2]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm1
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm0[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm14, %ymm6
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm7[2],xmm1[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
+; AVX1-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,0],xmm7[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm13[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm12[2],xmm1[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm9[2],xmm11[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm1[2],xmm14[3],xmm1[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,0],xmm1[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm12[2],xmm5[3],xmm12[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm12[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm1[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
+; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm1[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm6[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm1[3,0],xmm9[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm6[3,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm11[2],xmm2[3],xmm11[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm11[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,0],xmm4[3,0]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm13[3,0],xmm4[3,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm8[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,0],xmm3[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm6[3,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm4[2],xmm6[2]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm9[3,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX1-NEXT: vmovaps %ymm2, 96(%r8)
; AVX1-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX1-NEXT: vmovaps %ymm0, 480(%r8)
+; AVX1-NEXT: vmovaps %ymm3, 480(%r8)
; AVX1-NEXT: vmovaps %ymm5, 416(%r8)
-; AVX1-NEXT: vmovaps %ymm9, 224(%r8)
-; AVX1-NEXT: vmovaps %ymm10, 160(%r8)
-; AVX1-NEXT: vmovaps %ymm14, 352(%r8)
+; AVX1-NEXT: vmovaps %ymm7, 224(%r8)
+; AVX1-NEXT: vmovaps %ymm8, 160(%r8)
+; AVX1-NEXT: vmovaps %ymm0, 352(%r8)
+; AVX1-NEXT: vmovaps %ymm15, 288(%r8)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vmovaps %ymm0, 288(%r8)
-; AVX1-NEXT: vmovaps %ymm15, (%r8)
+; AVX1-NEXT: vmovaps %ymm0, (%r8)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 448(%r8)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -1146,133 +1144,133 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%rdx), %ymm6
; AVX2-NEXT: vmovaps (%rdx), %ymm2
-; AVX2-NEXT: vmovaps (%rcx), %xmm13
-; AVX2-NEXT: vmovaps 32(%rcx), %xmm10
-; AVX2-NEXT: vmovaps 64(%rcx), %xmm11
-; AVX2-NEXT: vmovaps (%rdx), %xmm14
-; AVX2-NEXT: vmovaps 32(%rdx), %xmm12
-; AVX2-NEXT: vmovaps 64(%rdx), %xmm3
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm11[0],xmm3[1],xmm11[1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-NEXT: vmovaps 32(%rsi), %xmm4
-; AVX2-NEXT: vmovaps 64(%rsi), %xmm7
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm0
-; AVX2-NEXT: vmovaps 64(%rdi), %xmm5
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3]
+; AVX2-NEXT: vmovaps (%rcx), %xmm9
+; AVX2-NEXT: vmovaps 32(%rcx), %xmm7
+; AVX2-NEXT: vmovaps 64(%rcx), %xmm4
+; AVX2-NEXT: vmovaps (%rdx), %xmm11
+; AVX2-NEXT: vmovaps 32(%rdx), %xmm8
+; AVX2-NEXT: vmovaps 64(%rdx), %xmm5
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX2-NEXT: vmovaps 32(%rsi), %xmm10
+; AVX2-NEXT: vmovaps 64(%rsi), %xmm12
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm13
+; AVX2-NEXT: vmovaps 64(%rdi), %xmm14
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovaps 96(%rcx), %xmm10
-; AVX2-NEXT: vmovaps 96(%rdx), %xmm3
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm10[0],xmm3[1],xmm10[1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,0,2,1]
-; AVX2-NEXT: vmovaps 96(%rsi), %xmm4
-; AVX2-NEXT: vmovaps 96(%rdi), %xmm0
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX2-NEXT: vmovaps 96(%rdx), %xmm12
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX2-NEXT: vmovaps 96(%rsi), %xmm13
+; AVX2-NEXT: vmovaps 96(%rdi), %xmm14
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps (%rsi), %xmm15
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX2-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps (%rsi), %xmm1
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3]
-; AVX2-NEXT: vmovaps (%rdi), %xmm10
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 64(%rcx), %ymm0
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,0,2,1]
+; AVX2-NEXT: vunpckhps {{.*#+}} xmm13 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
+; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 64(%rcx), %ymm1
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
; AVX2-NEXT: vmovaps (%rcx), %ymm13
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
-; AVX2-NEXT: vmovaps %ymm2, %ymm5
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7]
+; AVX2-NEXT: vmovaps %ymm2, %ymm8
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovaps 64(%rdi), %ymm10
-; AVX2-NEXT: vmovaps 64(%rsi), %ymm14
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[4],ymm1[4],ymm6[5],ymm1[5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
-; AVX2-NEXT: vmovaps 32(%rdx), %ymm6
-; AVX2-NEXT: vmovaps 32(%rcx), %ymm9
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
-; AVX2-NEXT: vmovaps 32(%rdi), %ymm14
+; AVX2-NEXT: vmovaps 64(%rdi), %ymm15
+; AVX2-NEXT: vmovaps 64(%rsi), %ymm14
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[4],ymm14[4],ymm15[5],ymm14[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
+; AVX2-NEXT: vmovaps 32(%rdx), %ymm3
+; AVX2-NEXT: vmovaps 32(%rcx), %ymm6
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3]
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm15
; AVX2-NEXT: vmovaps 32(%rsi), %ymm0
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7]
-; AVX2-NEXT: vmovaps 96(%rdx), %ymm6
-; AVX2-NEXT: vmovaps 96(%rcx), %ymm9
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX2-NEXT: vmovaps 96(%rdi), %ymm14
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7]
+; AVX2-NEXT: vmovaps 96(%rdx), %ymm3
+; AVX2-NEXT: vmovaps 96(%rcx), %ymm6
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3]
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm15
; AVX2-NEXT: vmovaps 96(%rsi), %ymm0
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7]
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
-; AVX2-NEXT: vmovaps %ymm6, 96(%r8)
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm13[2],ymm8[3],ymm13[3],ymm8[6],ymm13[6],ymm8[7],ymm13[7]
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7]
+; AVX2-NEXT: vmovaps %ymm3, 96(%r8)
; AVX2-NEXT: vmovaps %ymm0, 480(%r8)
-; AVX2-NEXT: vmovaps %ymm8, 448(%r8)
+; AVX2-NEXT: vmovaps %ymm7, 448(%r8)
; AVX2-NEXT: vmovaps %ymm1, 224(%r8)
-; AVX2-NEXT: vmovaps %ymm10, 192(%r8)
-; AVX2-NEXT: vmovaps %ymm4, 352(%r8)
-; AVX2-NEXT: vmovaps %ymm7, 320(%r8)
-; AVX2-NEXT: vmovaps %ymm12, 64(%r8)
-; AVX2-NEXT: vmovaps %ymm15, (%r8)
+; AVX2-NEXT: vmovaps %ymm4, 192(%r8)
+; AVX2-NEXT: vmovaps %ymm9, 352(%r8)
+; AVX2-NEXT: vmovaps %ymm10, 320(%r8)
+; AVX2-NEXT: vmovaps %ymm11, 64(%r8)
+; AVX2-NEXT: vmovaps %ymm12, (%r8)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index e3175a82992bc..56d8e32ad449d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -176,36 +176,36 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps (%rsi), %xmm8
+; SSE-NEXT: movaps (%rsi), %xmm2
; SSE-NEXT: movaps (%rdx), %xmm1
-; SSE-NEXT: movaps (%rcx), %xmm9
-; SSE-NEXT: movaps (%r8), %xmm2
+; SSE-NEXT: movaps (%rcx), %xmm4
+; SSE-NEXT: movaps (%r8), %xmm5
; SSE-NEXT: movaps (%r9), %xmm6
; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
-; SSE-NEXT: movaps %xmm2, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1]
-; SSE-NEXT: movaps %xmm2, %xmm5
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; SSE-NEXT: movaps %xmm2, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm4[0,2]
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movaps %xmm5, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm6[1,1]
+; SSE-NEXT: movaps %xmm5, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1]
+; SSE-NEXT: movaps %xmm5, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm6[3,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE-NEXT: movaps %xmm0, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm7[0,2]
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[2,3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm7[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm9[0,2]
; SSE-NEXT: movaps %xmm6, 16(%rax)
; SSE-NEXT: movaps %xmm0, 48(%rax)
; SSE-NEXT: movaps %xmm1, 80(%rax)
-; SSE-NEXT: movaps %xmm5, 64(%rax)
+; SSE-NEXT: movaps %xmm8, 64(%rax)
; SSE-NEXT: movaps %xmm3, 32(%rax)
-; SSE-NEXT: movaps %xmm2, (%rax)
+; SSE-NEXT: movaps %xmm5, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i32_stride6_vf4:
@@ -218,22 +218,22 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vmovaps (%r8), %xmm4
; AVX1-NEXT: vmovaps (%r9), %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm13
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm7
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm9
; AVX1-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm3[0,0],xmm1[0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7]
+; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm10
-; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm13[1,2],ymm10[1,2],ymm13[5,6],ymm10[5,6]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5]
-; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5],ymm10[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm11
+; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
@@ -245,7 +245,7 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7]
; AVX1-NEXT: vmovaps %ymm0, 64(%rax)
; AVX1-NEXT: vmovaps %ymm4, 32(%rax)
-; AVX1-NEXT: vmovaps %ymm7, (%rax)
+; AVX1-NEXT: vmovaps %ymm10, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -335,73 +335,71 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps (%rdi), %xmm2
-; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps (%rsi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 16(%rsi), %xmm13
-; SSE-NEXT: movaps (%rdx), %xmm4
+; SSE-NEXT: movaps 16(%rdi), %xmm0
+; SSE-NEXT: movaps (%rsi), %xmm9
+; SSE-NEXT: movaps 16(%rsi), %xmm12
+; SSE-NEXT: movaps (%rdx), %xmm3
; SSE-NEXT: movaps 16(%rdx), %xmm1
; SSE-NEXT: movaps (%rcx), %xmm11
-; SSE-NEXT: movaps 16(%rcx), %xmm15
+; SSE-NEXT: movaps 16(%rcx), %xmm13
; SSE-NEXT: movaps (%r8), %xmm10
-; SSE-NEXT: movaps 16(%r8), %xmm0
+; SSE-NEXT: movaps 16(%r8), %xmm14
; SSE-NEXT: movaps (%r9), %xmm7
; SSE-NEXT: movaps 16(%r9), %xmm5
-; SSE-NEXT: movaps %xmm0, %xmm6
+; SSE-NEXT: movaps %xmm14, %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm5[3,3]
-; SSE-NEXT: movaps %xmm1, %xmm9
-; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm6[0,2]
-; SSE-NEXT: movaps %xmm8, %xmm14
-; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; SSE-NEXT: movaps %xmm0, %xmm12
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm14[2,3]
-; SSE-NEXT: movaps %xmm15, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm3[0,2]
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3]
-; SSE-NEXT: movaps %xmm4, %xmm13
-; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm0[0,2]
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE-NEXT: movaps %xmm10, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
-; SSE-NEXT: movaps %xmm11, %xmm6
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0]
-; SSE-NEXT: movaps %xmm10, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm7[1,1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm6[0,2]
+; SSE-NEXT: movaps %xmm0, %xmm6
+; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE-NEXT: movaps %xmm14, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[2,3]
+; SSE-NEXT: movaps %xmm13, %xmm15
+; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,0]
+; SSE-NEXT: movaps %xmm14, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm5[1,1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm15[0,2]
+; SSE-NEXT: movaps %xmm10, %xmm13
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm7[3,3]
+; SSE-NEXT: movaps %xmm3, %xmm12
+; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm13[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm13
+; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3]
+; SSE-NEXT: movaps %xmm10, %xmm14
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[2,3]
+; SSE-NEXT: movaps %xmm11, %xmm15
+; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm3[1]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0]
+; SSE-NEXT: movaps %xmm10, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm7[1,1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[2,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm6[0,2]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm15[0,2]
; SSE-NEXT: movaps %xmm2, (%rax)
; SSE-NEXT: movaps %xmm7, 16(%rax)
-; SSE-NEXT: movaps %xmm4, 32(%rax)
-; SSE-NEXT: movaps %xmm0, 48(%rax)
-; SSE-NEXT: movaps %xmm3, 64(%rax)
-; SSE-NEXT: movaps %xmm13, 80(%rax)
-; SSE-NEXT: movaps %xmm8, 96(%rax)
+; SSE-NEXT: movaps %xmm3, 32(%rax)
+; SSE-NEXT: movaps %xmm13, 48(%rax)
+; SSE-NEXT: movaps %xmm14, 64(%rax)
+; SSE-NEXT: movaps %xmm12, 80(%rax)
+; SSE-NEXT: movaps %xmm0, 96(%rax)
; SSE-NEXT: movaps %xmm5, 112(%rax)
; SSE-NEXT: movaps %xmm1, 128(%rax)
-; SSE-NEXT: movaps %xmm14, 144(%rax)
-; SSE-NEXT: movaps %xmm12, 160(%rax)
-; SSE-NEXT: movaps %xmm9, 176(%rax)
+; SSE-NEXT: movaps %xmm6, 144(%rax)
+; SSE-NEXT: movaps %xmm8, 160(%rax)
+; SSE-NEXT: movaps %xmm4, 176(%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i32_stride6_vf8:
@@ -412,29 +410,29 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vmovaps (%rdx), %ymm5
; AVX1-NEXT: vmovaps (%rcx), %ymm6
; AVX1-NEXT: vmovaps (%r8), %ymm3
-; AVX1-NEXT: vmovaps (%rcx), %xmm13
-; AVX1-NEXT: vmovaps (%rdx), %xmm14
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,2],xmm13[1,2]
+; AVX1-NEXT: vmovaps (%rcx), %xmm1
+; AVX1-NEXT: vmovaps (%rdx), %xmm2
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,2],xmm1[1,2]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9
-; AVX1-NEXT: vmovaps (%rsi), %xmm0
-; AVX1-NEXT: vmovaps (%rdi), %xmm7
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7]
-; AVX1-NEXT: vbroadcastss 4(%r8), %xmm10
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7]
-; AVX1-NEXT: vbroadcastss 4(%r9), %ymm10
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7]
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps (%rsi), %xmm9
+; AVX1-NEXT: vmovaps (%rdi), %xmm10
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
+; AVX1-NEXT: vbroadcastss 4(%r8), %xmm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vbroadcastss 4(%r9), %ymm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7]
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm3[4,5],ymm10[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5],ymm7[6,7]
; AVX1-NEXT: vbroadcastss 16(%r9), %ymm12
-; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm12[5],ymm7[6,7]
; AVX1-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7]
; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,2],ymm6[1,2],ymm5[5,6],ymm6[5,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3]
@@ -444,281 +442,281 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7]
; AVX1-NEXT: vbroadcastss 20(%r9), %ymm12
; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7]
-; AVX1-NEXT: vbroadcastss (%rcx), %xmm1
-; AVX1-NEXT: vbroadcastss (%rdx), %xmm2
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, (%r8), %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX1-NEXT: vbroadcastss (%r9), %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX1-NEXT: vbroadcastss (%rcx), %xmm12
+; AVX1-NEXT: vbroadcastss (%rdx), %xmm13
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm10
+; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
+; AVX1-NEXT: vbroadcastss (%r9), %ymm10
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm11[2,3],ymm5[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5],ymm3[6,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT: vmovaps (%r9), %xmm2
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[0,2,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
-; AVX1-NEXT: vmovaps (%r9), %xmm3
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[0,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
-; AVX1-NEXT: vmovaps %ymm2, 64(%rax)
-; AVX1-NEXT: vmovaps %ymm1, 160(%rax)
-; AVX1-NEXT: vmovaps %ymm0, (%rax)
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7]
+; AVX1-NEXT: vmovaps %ymm1, 64(%rax)
+; AVX1-NEXT: vmovaps %ymm3, 160(%rax)
+; AVX1-NEXT: vmovaps %ymm9, (%rax)
; AVX1-NEXT: vmovaps %ymm8, 128(%rax)
-; AVX1-NEXT: vmovaps %ymm10, 96(%rax)
-; AVX1-NEXT: vmovaps %ymm9, 32(%rax)
+; AVX1-NEXT: vmovaps %ymm7, 96(%rax)
+; AVX1-NEXT: vmovaps %ymm0, 32(%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: store_i32_stride6_vf8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm10
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm12
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm15
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,1,2,2]
-; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm0
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm7
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm6
-; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm7
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm2
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %ymm6
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,2,4,5,6,6]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,2,3,5,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7]
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = mem[0],zero,mem[1],zero
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm7
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,1,3,3,6,5,7,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm14, %ymm5
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm3
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax)
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
+; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,2,2]
+; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm8
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm12
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm12
+; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm13
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm11
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7]
+; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm12, %ymm11
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7]
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm14
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7]
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,2,3,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,2,3,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm1
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm13, 160(%rax)
; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax)
; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: store_i32_stride6_vf8:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm9
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm11
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm4
; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm2
-; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm12
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm5
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm14
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,1,2,2]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm15
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2],xmm7[3]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5],ymm6[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm7
-; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd 4(%r9), %ymm8
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rcx), %xmm6
-; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rdx), %xmm0
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm7, %ymm1
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd (%r9), %ymm1
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[1,1,2,3,5,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4],ymm1[5],ymm5[6],ymm1[7]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd 20(%r9), %ymm5
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <6,u,u,u,u,u,7,u>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm5, %ymm5
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <u,6,u,u,u,u,u,7>
-; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm5, %ymm5
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7]
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5
-; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
-; AVX2-FAST-ALL-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm6, %ymm7
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm6, %ymm6
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6],ymm6[7]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm5
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm10
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm11
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm8
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,2,2]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm9
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[1,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm12
+; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd 4(%r9), %ymm13
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rcx), %xmm13
+; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rdx), %xmm14
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm12, %ymm11
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd (%r9), %ymm11
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7]
+; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd 20(%r9), %ymm13
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7]
+; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <6,u,u,u,u,u,7,u>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm13, %ymm13
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <u,6,u,u,u,u,u,7>
+; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm13, %ymm13
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4,5,6],ymm13[7]
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,2,3,3,2,2,3,3]
+; AVX2-FAST-ALL-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm8, %ymm9
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm8, %ymm5
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5,6],ymm5[7]
; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd 16(%r9), %ymm3
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 96(%rax)
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd 16(%r9), %ymm1
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 96(%rax)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, 64(%rax)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 160(%rax)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 128(%rax)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm11, 160(%rax)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm12, 128(%rax)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm10, (%rax)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm8, 32(%rax)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, 32(%rax)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf8:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm15
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,1,2,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm2, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,2,4,5,6,6]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,2,3,5,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = mem[0],zero,mem[1],zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
-; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,1,3,3,6,5,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm14, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm8
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm12
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm12, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm14
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = mem[0,2,2,3,4,6,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 160(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -772,107 +770,107 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-LABEL: store_i32_stride6_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $72, %rsp
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps 16(%rdi), %xmm10
-; SSE-NEXT: movaps (%rsi), %xmm11
-; SSE-NEXT: movaps 16(%rsi), %xmm8
-; SSE-NEXT: movaps (%rdx), %xmm13
-; SSE-NEXT: movaps 16(%rdx), %xmm12
-; SSE-NEXT: movaps (%rcx), %xmm3
-; SSE-NEXT: movaps 16(%rcx), %xmm9
-; SSE-NEXT: movaps (%r8), %xmm2
-; SSE-NEXT: movaps 16(%r8), %xmm7
-; SSE-NEXT: movaps (%r9), %xmm1
-; SSE-NEXT: movaps 16(%r9), %xmm14
-; SSE-NEXT: movaps %xmm13, %xmm4
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: movaps %xmm0, %xmm5
-; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1]
-; SSE-NEXT: movaps %xmm1, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3]
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm2, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm5[0,2]
+; SSE-NEXT: movaps (%rdi), %xmm4
+; SSE-NEXT: movaps 16(%rdi), %xmm5
+; SSE-NEXT: movaps (%rsi), %xmm8
+; SSE-NEXT: movaps 16(%rsi), %xmm11
+; SSE-NEXT: movaps (%rdx), %xmm6
+; SSE-NEXT: movaps 16(%rdx), %xmm7
+; SSE-NEXT: movaps (%rcx), %xmm1
+; SSE-NEXT: movaps 16(%rcx), %xmm14
+; SSE-NEXT: movaps (%r8), %xmm9
+; SSE-NEXT: movaps 16(%r8), %xmm15
+; SSE-NEXT: movaps (%r9), %xmm2
+; SSE-NEXT: movaps 16(%r9), %xmm0
+; SSE-NEXT: movaps %xmm6, %xmm10
+; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
+; SSE-NEXT: movaps %xmm4, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE-NEXT: movaps %xmm2, %xmm12
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[2,3]
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm9, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[0,2]
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm3
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; SSE-NEXT: movaps %xmm9, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[2,3]
+; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm3, %xmm5
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1]
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE-NEXT: movaps %xmm2, %xmm4
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[2,3]
-; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2]
-; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE-NEXT: movaps %xmm14, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm14[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2]
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
+; SSE-NEXT: movaps %xmm5, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1]
-; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0]
-; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 32(%rdi), %xmm10
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm14[3,3]
-; SSE-NEXT: movaps 32(%rdx), %xmm11
-; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; SSE-NEXT: movaps %xmm15, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm14, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm11[2],xmm5[3],xmm11[3]
+; SSE-NEXT: movaps %xmm15, %xmm2
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,3]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0]
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rdi), %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm0[3,3]
+; SSE-NEXT: movaps 32(%rdx), %xmm13
+; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3]
; SSE-NEXT: movaps 32(%rcx), %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm7[0,2]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm11, %xmm12
-; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[0,2]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm13, %xmm15
+; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; SSE-NEXT: movaps 32(%rsi), %xmm1
-; SSE-NEXT: movaps %xmm10, %xmm13
-; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
+; SSE-NEXT: movaps %xmm12, %xmm14
+; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
; SSE-NEXT: movaps 32(%r8), %xmm2
-; SSE-NEXT: movaps 32(%r9), %xmm8
-; SSE-NEXT: movaps %xmm8, %xmm15
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0]
+; SSE-NEXT: movaps 32(%r9), %xmm4
+; SSE-NEXT: movaps %xmm4, %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm14[2,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0]
; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm8[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[0,2]
-; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[0,2]
+; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1]
-; SSE-NEXT: movaps %xmm2, %xmm9
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm10[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
+; SSE-NEXT: movaps %xmm2, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2]
; SSE-NEXT: movaps 48(%rdx), %xmm2
-; SSE-NEXT: movaps 48(%rcx), %xmm8
+; SSE-NEXT: movaps 48(%rcx), %xmm9
; SSE-NEXT: movaps %xmm2, %xmm4
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
; SSE-NEXT: movaps 48(%rdi), %xmm0
-; SSE-NEXT: movaps 48(%rsi), %xmm14
+; SSE-NEXT: movaps 48(%rsi), %xmm10
; SSE-NEXT: movaps %xmm0, %xmm5
-; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
; SSE-NEXT: movaps 48(%r8), %xmm3
; SSE-NEXT: movaps 48(%r9), %xmm7
; SSE-NEXT: movaps %xmm7, %xmm6
@@ -882,29 +880,29 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2]
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE-NEXT: movaps %xmm8, %xmm14
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1]
-; SSE-NEXT: movaps %xmm3, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,0]
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
+; SSE-NEXT: movaps %xmm9, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT: movaps %xmm3, %xmm10
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3]
-; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm2, 368(%rax)
-; SSE-NEXT: movaps %xmm1, 352(%rax)
+; SSE-NEXT: movaps %xmm10, 352(%rax)
; SSE-NEXT: movaps %xmm0, 336(%rax)
; SSE-NEXT: movaps %xmm4, 320(%rax)
; SSE-NEXT: movaps %xmm6, 304(%rax)
; SSE-NEXT: movaps %xmm5, 288(%rax)
-; SSE-NEXT: movaps %xmm11, 272(%rax)
-; SSE-NEXT: movaps %xmm9, 256(%rax)
-; SSE-NEXT: movaps %xmm10, 240(%rax)
-; SSE-NEXT: movaps %xmm12, 224(%rax)
-; SSE-NEXT: movaps %xmm15, 208(%rax)
-; SSE-NEXT: movaps %xmm13, 192(%rax)
+; SSE-NEXT: movaps %xmm13, 272(%rax)
+; SSE-NEXT: movaps %xmm8, 256(%rax)
+; SSE-NEXT: movaps %xmm12, 240(%rax)
+; SSE-NEXT: movaps %xmm15, 224(%rax)
+; SSE-NEXT: movaps %xmm11, 208(%rax)
+; SSE-NEXT: movaps %xmm14, 192(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -935,7 +933,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-LABEL: store_i32_stride6_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $136, %rsp
-; AVX1-NEXT: vmovaps (%rdi), %ymm8
+; AVX1-NEXT: vmovaps (%rdi), %ymm6
; AVX1-NEXT: vmovaps 32(%rdi), %ymm4
; AVX1-NEXT: vmovaps (%rsi), %ymm5
; AVX1-NEXT: vmovaps 32(%rsi), %ymm2
@@ -955,23 +953,23 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 32(%rcx), %xmm11
-; AVX1-NEXT: vmovaps 32(%rdx), %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,2],xmm11[1,2]
+; AVX1-NEXT: vmovaps 32(%rdx), %xmm8
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,2],xmm11[1,2]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps 32(%rsi), %xmm1
; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 32(%rdi), %xmm3
; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX1-NEXT: vbroadcastss 36(%r8), %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vbroadcastss 36(%r9), %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5]
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vmovaps (%rcx), %ymm9
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[2],ymm15[2]
@@ -1010,18 +1008,18 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-NEXT: vbroadcastss 52(%r9), %ymm4
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm4 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7]
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,2],ymm9[1,2],ymm15[5,6],ymm9[5,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX1-NEXT: vbroadcastss 20(%r8), %xmm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vbroadcastss 20(%r9), %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
+; AVX1-NEXT: vbroadcastss 20(%r8), %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vbroadcastss 20(%r9), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
; AVX1-NEXT: vbroadcastss (%rcx), %xmm0
-; AVX1-NEXT: vbroadcastss (%rdx), %xmm5
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; AVX1-NEXT: vbroadcastss (%rdx), %xmm6
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
@@ -1039,58 +1037,58 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7]
; AVX1-NEXT: vmovaps 32(%r9), %xmm3
-; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[0,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm3[0,2,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7]
; AVX1-NEXT: vbroadcastss 32(%rcx), %xmm1
-; AVX1-NEXT: vbroadcastss 32(%rdx), %xmm5
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, 32(%r8), %ymm5, %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
-; AVX1-NEXT: vbroadcastss 32(%r9), %ymm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3]
-; AVX1-NEXT: vperm2f128 $51, (%rsp), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX1-NEXT: # ymm5 = mem[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7]
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX1-NEXT: vmovaps (%r9), %xmm6
-; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,1,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm7, %ymm7
-; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
+; AVX1-NEXT: vbroadcastss 32(%rdx), %xmm6
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, 32(%r8), %ymm6, %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
+; AVX1-NEXT: vbroadcastss 32(%r9), %ymm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm6[2,3]
+; AVX1-NEXT: vperm2f128 $51, (%rsp), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX1-NEXT: # ymm6 = mem[2,3,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5],ymm6[6,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6],ymm6[7]
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
+; AVX1-NEXT: vmovaps (%r9), %xmm7
+; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm14, %ymm6
+; AVX1-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7]
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovaps %ymm5, 64(%rax)
-; AVX1-NEXT: vmovaps %ymm4, 160(%rax)
+; AVX1-NEXT: vmovaps %ymm6, 64(%rax)
+; AVX1-NEXT: vmovaps %ymm5, 160(%rax)
; AVX1-NEXT: vmovaps %ymm1, 192(%rax)
; AVX1-NEXT: vmovaps %ymm3, 256(%rax)
; AVX1-NEXT: vmovaps %ymm2, 352(%rax)
; AVX1-NEXT: vmovaps %ymm0, (%rax)
-; AVX1-NEXT: vmovaps %ymm8, 128(%rax)
+; AVX1-NEXT: vmovaps %ymm4, 128(%rax)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 320(%rax)
; AVX1-NEXT: vmovaps %ymm12, 32(%rax)
@@ -1108,127 +1106,127 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: subq $200, %rsp
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0
-; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm5[2],xmm14[3],xmm5[3]
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6
; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4
-; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3
+; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7
; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1
-; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm12
-; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm11
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm4
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,2,2]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm12[0],zero,xmm12[1],zero
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm4
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm3
-; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm4
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4
+; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10
+; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm12
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm3
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm3
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm2
+; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm3
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm1
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm15
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm1
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,2,4,5,6,6]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm7
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7]
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm4
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm2
-; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm4
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
+; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,2,4,5,6,6]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5],ymm1[6,7]
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm11, %ymm2
+; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm1
+; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm2
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm2
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm6
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm6, %ymm2
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm5
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm5, %ymm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,2,4,5,6,6]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[1,1,2,3,5,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,1,2,3]
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm10
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,1,2,3]
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm8
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7]
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3],ymm14[4,5,6,7]
; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm14
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
-; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm14
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7]
+; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3]
+; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm13
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,1,3,3,6,5,7,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5],ymm14[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6],ymm14[7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm3
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,2,3,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,3,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
@@ -1242,7 +1240,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
@@ -1255,7 +1253,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,2,3,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5],ymm5[6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,2,3,3]
@@ -1267,7 +1265,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vmovdqa %ymm4, 160(%rax)
; AVX2-SLOW-NEXT: vmovdqa %ymm3, 256(%rax)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm13, 352(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm11, 352(%rax)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -1287,142 +1285,142 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-ALL-LABEL: store_i32_stride6_vf16:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: subq $184, %rsp
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm9
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %xmm10
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm8
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm7
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,2]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm1
-; AVX2-FAST-ALL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm11
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,3]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm11
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm5
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm3
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm6
+; AVX2-FAST-ALL-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm8
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm3
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %xmm15
+; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm15[0],zero,xmm15[1],zero
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd 36(%r9), %ymm4
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,2]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm5
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %xmm0
-; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd 36(%r9), %ymm6
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,2,2]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2],xmm6[3]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7]
+; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpbroadcastd 4(%r9), %ymm4
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rcx), %xmm3
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rcx), %xmm2
; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rdx), %xmm4
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm13
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %ymm14
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm5, %ymm2
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd (%r9), %ymm2
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,2,3,5,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm12
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %ymm13
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm3, %ymm1
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd (%r9), %ymm1
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,1,2,2,4,5,6,6]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,1,2,3,5,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %ymm2
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd 52(%r9), %ymm3
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%rcx), %xmm1
-; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%rdx), %xmm3
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd 52(%r9), %ymm1
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%rcx), %xmm0
+; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%rdx), %xmm1
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm15, %ymm1
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%r9), %ymm1
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm7
+; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm15
; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm0
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[1,1,2,3,5,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,1,2,3,5,5,6,7]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm10 = ymm1[2,1,2,3]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm11 = ymm1[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm5
; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm3
; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5],ymm10[6,7]
-; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpbroadcastd 20(%r9), %ymm15
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm15[3],ymm10[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
-; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm15[2,3]
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %ymm15
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = [6,5,3,3,6,5,7,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm15, %ymm9, %ymm12
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r9), %ymm12
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,2,3,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm8, %ymm10
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5,6],ymm10[7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm1[4,5],ymm11[6,7]
+; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm10 = mem[0],zero,mem[1],zero
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpbroadcastd 20(%r9), %ymm11
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7]
+; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3]
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %ymm14
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm14, %ymm7, %ymm9
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5],ymm9[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r9), %ymm10
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm6, %ymm11
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6],ymm11[7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5]
; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
-; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm10[2,2,2,2]
+; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,2,2,2]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7]
; AVX2-FAST-ALL-NEXT: vpbroadcastd 48(%r9), %ymm4
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload
-; AVX2-FAST-ALL-NEXT: # xmm4 = xmm11[2],mem[2],xmm11[3],mem[3]
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload
+; AVX2-FAST-ALL-NEXT: # xmm4 = xmm8[2],mem[2],xmm8[3],mem[3]
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4
-; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
-; AVX2-FAST-ALL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm15, %ymm10, %ymm11
+; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,2,3,3,2,2,3,3]
+; AVX2-FAST-ALL-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm14, %ymm8, %ymm11
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3,4,5],ymm11[6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm10, %ymm11
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4,5,6],ymm11[7]
-; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[6],ymm0[6],ymm7[7],ymm0[7]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
-; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm11
-; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm9, %ymm9
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7]
-; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm9
-; AVX2-FAST-ALL-NEXT: vpermd %ymm9, %ymm8, %ymm8
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6],ymm8[7]
-; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm8, %ymm10
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3,4,5,6],ymm10[7]
+; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7]
+; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm10[2,3]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm10
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm7, %ymm7
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5],ymm7[6,7]
+; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm7
+; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm6, %ymm6
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5]
; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vpbroadcastd 16(%r9), %ymm3
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -1431,9 +1429,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
-; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm10, %ymm5
+; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm8, %ymm5
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm9, %ymm10, %ymm5
+; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm8, %ymm5
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7]
; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 64(%rax)
@@ -1441,7 +1439,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 160(%rax)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, 256(%rax)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 288(%rax)
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, 352(%rax)
+; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, 352(%rax)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -1462,127 +1460,127 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm5
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm5[2],xmm14[3],xmm5[3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm12
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm11
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm12[0],zero,xmm12[1],zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm4
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm15
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,2,4,5,6,6]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,2,4,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm6, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm5, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,2,4,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[1,1,2,3,5,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm10
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3],ymm14[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm13
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,1,3,3,6,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5],ymm14[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = mem[0,2,2,3,4,6,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6],ymm14[7]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm3
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,2,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
@@ -1596,7 +1594,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
@@ -1609,7 +1607,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,2,3,3]
@@ -1621,7 +1619,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 256(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 288(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 352(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 352(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
index 10bf7e54ab975..ae5dea60451db 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
@@ -136,9 +136,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; SSE-NEXT: movaps (%rsi), %xmm4
; SSE-NEXT: movaps 16(%rsi), %xmm5
; SSE-NEXT: movaps 32(%rsi), %xmm6
-; SSE-NEXT: movaps 48(%rsi), %xmm8
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm4[1]
+; SSE-NEXT: movaps 48(%rsi), %xmm7
+; SSE-NEXT: movaps %xmm0, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
@@ -147,8 +147,8 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE-NEXT: movaps %xmm3, %xmm6
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE-NEXT: movaps %xmm3, 96(%rdx)
; SSE-NEXT: movaps %xmm6, 112(%rdx)
; SSE-NEXT: movaps %xmm2, 64(%rdx)
@@ -156,7 +156,7 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; SSE-NEXT: movaps %xmm1, 32(%rdx)
; SSE-NEXT: movaps %xmm4, 48(%rdx)
; SSE-NEXT: movaps %xmm0, (%rdx)
-; SSE-NEXT: movaps %xmm7, 16(%rdx)
+; SSE-NEXT: movaps %xmm8, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i64_stride2_vf8:
@@ -235,64 +235,62 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride2_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps 112(%rdi), %xmm4
+; SSE-NEXT: movaps 112(%rdi), %xmm0
; SSE-NEXT: movaps 96(%rdi), %xmm6
-; SSE-NEXT: movaps 80(%rdi), %xmm8
-; SSE-NEXT: movaps 64(%rdi), %xmm9
-; SSE-NEXT: movaps (%rdi), %xmm11
-; SSE-NEXT: movaps 16(%rdi), %xmm14
-; SSE-NEXT: movaps 32(%rdi), %xmm15
+; SSE-NEXT: movaps 80(%rdi), %xmm4
+; SSE-NEXT: movaps 64(%rdi), %xmm3
+; SSE-NEXT: movaps (%rdi), %xmm8
+; SSE-NEXT: movaps 16(%rdi), %xmm1
+; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm5
-; SSE-NEXT: movaps 96(%rsi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 96(%rsi), %xmm11
; SSE-NEXT: movaps 80(%rsi), %xmm12
; SSE-NEXT: movaps 64(%rsi), %xmm13
-; SSE-NEXT: movaps (%rsi), %xmm2
-; SSE-NEXT: movaps 16(%rsi), %xmm1
-; SSE-NEXT: movaps 32(%rsi), %xmm0
-; SSE-NEXT: movaps 48(%rsi), %xmm3
-; SSE-NEXT: movaps %xmm11, %xmm7
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
+; SSE-NEXT: movaps (%rsi), %xmm9
+; SSE-NEXT: movaps 16(%rsi), %xmm10
+; SSE-NEXT: movaps 32(%rsi), %xmm14
+; SSE-NEXT: movaps 48(%rsi), %xmm15
+; SSE-NEXT: movaps %xmm8, %xmm7
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0]
-; SSE-NEXT: movaps %xmm14, %xmm10
-; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm1[0]
-; SSE-NEXT: movaps %xmm15, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; SSE-NEXT: movaps %xmm9, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0]
-; SSE-NEXT: movaps %xmm8, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: movaps %xmm1, %xmm9
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
+; SSE-NEXT: movaps %xmm2, %xmm10
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0]
+; SSE-NEXT: movaps %xmm5, %xmm14
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0]
+; SSE-NEXT: movaps %xmm3, %xmm15
+; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0]
+; SSE-NEXT: movaps %xmm4, %xmm13
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0]
-; SSE-NEXT: movaps %xmm6, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSE-NEXT: movaps 112(%rsi), %xmm12
-; SSE-NEXT: movaps %xmm4, %xmm7
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1]
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0]
-; SSE-NEXT: movaps %xmm4, 224(%rdx)
+; SSE-NEXT: movaps %xmm6, %xmm12
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0]
+; SSE-NEXT: movaps 112(%rsi), %xmm11
+; SSE-NEXT: movaps %xmm0, %xmm7
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
+; SSE-NEXT: movaps %xmm0, 224(%rdx)
; SSE-NEXT: movaps %xmm7, 240(%rdx)
; SSE-NEXT: movaps %xmm6, 192(%rdx)
-; SSE-NEXT: movaps %xmm3, 208(%rdx)
-; SSE-NEXT: movaps %xmm8, 160(%rdx)
+; SSE-NEXT: movaps %xmm12, 208(%rdx)
+; SSE-NEXT: movaps %xmm4, 160(%rdx)
; SSE-NEXT: movaps %xmm13, 176(%rdx)
-; SSE-NEXT: movaps %xmm9, 128(%rdx)
-; SSE-NEXT: movaps %xmm1, 144(%rdx)
+; SSE-NEXT: movaps %xmm3, 128(%rdx)
+; SSE-NEXT: movaps %xmm15, 144(%rdx)
; SSE-NEXT: movaps %xmm5, 96(%rdx)
-; SSE-NEXT: movaps %xmm0, 112(%rdx)
-; SSE-NEXT: movaps %xmm15, 64(%rdx)
-; SSE-NEXT: movaps %xmm2, 80(%rdx)
-; SSE-NEXT: movaps %xmm14, 32(%rdx)
-; SSE-NEXT: movaps %xmm10, 48(%rdx)
-; SSE-NEXT: movaps %xmm11, (%rdx)
+; SSE-NEXT: movaps %xmm14, 112(%rdx)
+; SSE-NEXT: movaps %xmm2, 64(%rdx)
+; SSE-NEXT: movaps %xmm10, 80(%rdx)
+; SSE-NEXT: movaps %xmm1, 32(%rdx)
+; SSE-NEXT: movaps %xmm9, 48(%rdx)
+; SSE-NEXT: movaps %xmm8, (%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rdx)
; SSE-NEXT: retq
@@ -300,25 +298,25 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX1-LABEL: store_i64_stride2_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rsi), %xmm0
-; AVX1-NEXT: vmovaps 32(%rsi), %xmm8
+; AVX1-NEXT: vmovaps 32(%rsi), %xmm1
; AVX1-NEXT: vmovaps 64(%rsi), %xmm2
; AVX1-NEXT: vmovaps 96(%rsi), %xmm3
; AVX1-NEXT: vmovaps (%rdi), %xmm4
; AVX1-NEXT: vmovaps 32(%rdi), %xmm5
; AVX1-NEXT: vmovaps 64(%rdi), %xmm6
; AVX1-NEXT: vmovaps 96(%rdi), %xmm7
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm3[1]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm3[1]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm2[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm0[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm0[1]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm8[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm8[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3]
@@ -335,10 +333,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX1-NEXT: vmovapd %ymm6, 96(%rdx)
; AVX1-NEXT: vmovapd %ymm5, 32(%rdx)
; AVX1-NEXT: vmovapd %ymm4, 160(%rdx)
-; AVX1-NEXT: vmovaps %ymm3, 64(%rdx)
+; AVX1-NEXT: vmovaps %ymm1, 64(%rdx)
; AVX1-NEXT: vmovapd %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm2, 128(%rdx)
-; AVX1-NEXT: vmovaps %ymm1, 192(%rdx)
+; AVX1-NEXT: vmovaps %ymm3, 192(%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -423,53 +421,53 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; SSE-NEXT: subq $152, %rsp
; SSE-NEXT: movaps 112(%rdi), %xmm14
; SSE-NEXT: movaps 96(%rdi), %xmm13
-; SSE-NEXT: movaps 80(%rdi), %xmm11
-; SSE-NEXT: movaps 64(%rdi), %xmm10
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps 32(%rdi), %xmm9
-; SSE-NEXT: movaps 48(%rdi), %xmm12
-; SSE-NEXT: movaps 96(%rsi), %xmm15
+; SSE-NEXT: movaps 80(%rdi), %xmm10
+; SSE-NEXT: movaps 64(%rdi), %xmm9
+; SSE-NEXT: movaps (%rdi), %xmm6
+; SSE-NEXT: movaps 16(%rdi), %xmm7
+; SSE-NEXT: movaps 32(%rdi), %xmm8
+; SSE-NEXT: movaps 48(%rdi), %xmm11
+; SSE-NEXT: movaps 96(%rsi), %xmm0
; SSE-NEXT: movaps 80(%rsi), %xmm1
; SSE-NEXT: movaps 64(%rsi), %xmm2
; SSE-NEXT: movaps (%rsi), %xmm3
; SSE-NEXT: movaps 16(%rsi), %xmm4
; SSE-NEXT: movaps 32(%rsi), %xmm5
-; SSE-NEXT: movaps 48(%rsi), %xmm6
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0]
+; SSE-NEXT: movaps 48(%rsi), %xmm12
+; SSE-NEXT: movaps %xmm6, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
+; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm7, %xmm6
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm4[1]
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
+; SSE-NEXT: movaps %xmm8, %xmm4
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1]
+; SSE-NEXT: movaps %xmm11, %xmm4
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm9, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1]
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
+; SSE-NEXT: movaps %xmm10, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
-; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm13, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm15[1]
+; SSE-NEXT: movaps %xmm13, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%rsi), %xmm0
; SSE-NEXT: movaps %xmm14, %xmm1
@@ -577,48 +575,48 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovaps 128(%rsi), %xmm1
; AVX1-NEXT: vmovaps 128(%rdi), %xmm2
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm9
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps (%rsi), %xmm2
; AVX1-NEXT: vmovaps 32(%rsi), %xmm3
; AVX1-NEXT: vmovaps 64(%rsi), %xmm4
-; AVX1-NEXT: vmovaps 96(%rsi), %xmm10
+; AVX1-NEXT: vmovaps 96(%rsi), %xmm5
; AVX1-NEXT: vmovaps (%rdi), %xmm6
; AVX1-NEXT: vmovaps 32(%rdi), %xmm7
-; AVX1-NEXT: vmovaps 64(%rdi), %xmm0
-; AVX1-NEXT: vmovaps 96(%rdi), %xmm1
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm2[1]
+; AVX1-NEXT: vmovaps 64(%rdi), %xmm8
+; AVX1-NEXT: vmovaps 96(%rdi), %xmm9
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm2[1]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm3[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm3[1]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm4[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm4
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm10[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm5
-; AVX1-NEXT: vmovaps 160(%rsi), %xmm0
-; AVX1-NEXT: vmovaps 160(%rdi), %xmm1
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm0[1]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX1-NEXT: vmovapd 192(%rsi), %xmm0
-; AVX1-NEXT: vmovapd 192(%rdi), %xmm1
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm1[1],xmm0[1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: vmovaps 160(%rsi), %xmm6
+; AVX1-NEXT: vmovaps 160(%rdi), %xmm7
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
+; AVX1-NEXT: vmovaps 192(%rsi), %xmm7
+; AVX1-NEXT: vmovaps 192(%rdi), %xmm8
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[3],ymm1[3]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[3],ymm9[3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[3],ymm10[3]
@@ -635,23 +633,23 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[3],ymm14[3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm15[0],ymm8[3],ymm15[3]
-; AVX1-NEXT: vmovapd %ymm8, 480(%rdx)
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[3],ymm15[3]
+; AVX1-NEXT: vmovapd %ymm0, 480(%rdx)
; AVX1-NEXT: vmovapd %ymm14, 416(%rdx)
; AVX1-NEXT: vmovapd %ymm13, 352(%rdx)
; AVX1-NEXT: vmovapd %ymm12, 224(%rdx)
; AVX1-NEXT: vmovapd %ymm11, 160(%rdx)
; AVX1-NEXT: vmovapd %ymm10, 96(%rdx)
-; AVX1-NEXT: vmovapd %ymm1, 32(%rdx)
-; AVX1-NEXT: vmovapd %ymm0, 288(%rdx)
+; AVX1-NEXT: vmovapd %ymm9, 32(%rdx)
+; AVX1-NEXT: vmovapd %ymm8, 288(%rdx)
; AVX1-NEXT: vmovaps %ymm7, 384(%rdx)
; AVX1-NEXT: vmovaps %ymm6, 320(%rdx)
; AVX1-NEXT: vmovaps %ymm5, 192(%rdx)
; AVX1-NEXT: vmovaps %ymm4, 128(%rdx)
; AVX1-NEXT: vmovaps %ymm3, 64(%rdx)
; AVX1-NEXT: vmovaps %ymm2, (%rdx)
-; AVX1-NEXT: vmovaps %ymm9, 256(%rdx)
+; AVX1-NEXT: vmovaps %ymm1, 256(%rdx)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 448(%rdx)
; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
index 45af9f9f72667..5433f44b90d8a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
@@ -181,44 +181,44 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm3
; SSE-NEXT: movaps 16(%rdi), %xmm2
-; SSE-NEXT: movaps 32(%rdi), %xmm13
-; SSE-NEXT: movaps 48(%rdi), %xmm12
-; SSE-NEXT: movaps (%rsi), %xmm8
-; SSE-NEXT: movaps 16(%rsi), %xmm9
-; SSE-NEXT: movaps 32(%rsi), %xmm11
-; SSE-NEXT: movaps 48(%rsi), %xmm4
-; SSE-NEXT: movaps (%rdx), %xmm7
-; SSE-NEXT: movaps 16(%rdx), %xmm0
+; SSE-NEXT: movaps 32(%rdi), %xmm1
+; SSE-NEXT: movaps 48(%rdi), %xmm0
+; SSE-NEXT: movaps (%rsi), %xmm7
+; SSE-NEXT: movaps 16(%rsi), %xmm8
+; SSE-NEXT: movaps 32(%rsi), %xmm9
+; SSE-NEXT: movaps 48(%rsi), %xmm10
+; SSE-NEXT: movaps (%rdx), %xmm11
+; SSE-NEXT: movaps 16(%rdx), %xmm12
; SSE-NEXT: movaps 32(%rdx), %xmm6
; SSE-NEXT: movaps 48(%rdx), %xmm5
-; SSE-NEXT: movaps %xmm4, %xmm10
-; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm4[0]
-; SSE-NEXT: movaps %xmm11, %xmm14
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0]
-; SSE-NEXT: movaps %xmm9, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
-; SSE-NEXT: movaps %xmm8, %xmm4
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3]
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0]
+; SSE-NEXT: movaps %xmm10, %xmm4
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0]
+; SSE-NEXT: movaps %xmm9, %xmm10
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0]
+; SSE-NEXT: movaps %xmm8, %xmm9
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0]
+; SSE-NEXT: movaps %xmm7, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[2,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE-NEXT: movaps %xmm3, (%rcx)
-; SSE-NEXT: movaps %xmm7, 16(%rcx)
-; SSE-NEXT: movaps %xmm4, 32(%rcx)
+; SSE-NEXT: movaps %xmm11, 16(%rcx)
+; SSE-NEXT: movaps %xmm8, 32(%rcx)
; SSE-NEXT: movaps %xmm2, 48(%rcx)
-; SSE-NEXT: movaps %xmm0, 64(%rcx)
-; SSE-NEXT: movaps %xmm1, 80(%rcx)
-; SSE-NEXT: movaps %xmm13, 96(%rcx)
+; SSE-NEXT: movaps %xmm12, 64(%rcx)
+; SSE-NEXT: movaps %xmm9, 80(%rcx)
+; SSE-NEXT: movaps %xmm1, 96(%rcx)
; SSE-NEXT: movaps %xmm6, 112(%rcx)
-; SSE-NEXT: movaps %xmm14, 128(%rcx)
-; SSE-NEXT: movaps %xmm12, 144(%rcx)
+; SSE-NEXT: movaps %xmm10, 128(%rcx)
+; SSE-NEXT: movaps %xmm0, 144(%rcx)
; SSE-NEXT: movaps %xmm5, 160(%rcx)
-; SSE-NEXT: movaps %xmm10, 176(%rcx)
+; SSE-NEXT: movaps %xmm4, 176(%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i64_stride3_vf8:
@@ -343,98 +343,99 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-LABEL: store_i64_stride3_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $24, %rsp
-; SSE-NEXT: movapd 64(%rdi), %xmm9
-; SSE-NEXT: movapd (%rdi), %xmm3
-; SSE-NEXT: movapd 16(%rdi), %xmm13
-; SSE-NEXT: movapd 32(%rdi), %xmm8
-; SSE-NEXT: movapd 48(%rdi), %xmm10
-; SSE-NEXT: movapd 64(%rsi), %xmm12
-; SSE-NEXT: movapd (%rsi), %xmm7
-; SSE-NEXT: movapd 16(%rsi), %xmm14
-; SSE-NEXT: movapd 32(%rsi), %xmm15
-; SSE-NEXT: movapd 48(%rsi), %xmm11
-; SSE-NEXT: movapd 64(%rdx), %xmm6
-; SSE-NEXT: movapd (%rdx), %xmm2
-; SSE-NEXT: movapd 16(%rdx), %xmm4
-; SSE-NEXT: movapd 32(%rdx), %xmm5
-; SSE-NEXT: movapd 48(%rdx), %xmm0
-; SSE-NEXT: movapd %xmm3, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm7[0]
-; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill
-; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
+; SSE-NEXT: movapd 64(%rdi), %xmm4
+; SSE-NEXT: movapd (%rdi), %xmm0
+; SSE-NEXT: movapd 16(%rdi), %xmm1
+; SSE-NEXT: movapd 32(%rdi), %xmm2
+; SSE-NEXT: movapd 48(%rdi), %xmm5
+; SSE-NEXT: movapd 64(%rsi), %xmm9
+; SSE-NEXT: movapd (%rsi), %xmm3
+; SSE-NEXT: movapd 16(%rsi), %xmm6
+; SSE-NEXT: movapd 32(%rsi), %xmm7
+; SSE-NEXT: movapd 48(%rsi), %xmm10
+; SSE-NEXT: movapd 64(%rdx), %xmm15
+; SSE-NEXT: movapd (%rdx), %xmm11
+; SSE-NEXT: movapd 16(%rdx), %xmm12
+; SSE-NEXT: movapd 32(%rdx), %xmm13
+; SSE-NEXT: movapd 48(%rdx), %xmm14
+; SSE-NEXT: movapd %xmm0, %xmm8
+; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm3[0]
+; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1]
+; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm1, %xmm11
+; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1]
+; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1]
+; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movapd %xmm2, %xmm12
+; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm7[0]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1]
+; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1]
; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movapd %xmm13, %xmm3
-; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm14[0]
-; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm4[0],xmm13[1]
-; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1]
-; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movapd %xmm8, %xmm13
-; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm15[0]
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
-; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1]
-; SSE-NEXT: movapd %xmm10, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm11[0]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
-; SSE-NEXT: movapd %xmm9, %xmm14
-; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm12[0]
-; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1]
-; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
-; SSE-NEXT: movapd 80(%rdi), %xmm8
+; SSE-NEXT: movapd %xmm5, %xmm13
+; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm14[0],xmm5[1]
+; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1]
+; SSE-NEXT: movapd %xmm4, %xmm14
+; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm9[0]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1]
+; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1]
+; SSE-NEXT: movapd 80(%rdi), %xmm15
; SSE-NEXT: movapd 80(%rsi), %xmm6
-; SSE-NEXT: movapd %xmm8, %xmm9
-; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0]
+; SSE-NEXT: movapd %xmm15, %xmm8
+; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm6[0]
; SSE-NEXT: movapd 80(%rdx), %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1]
+; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
-; SSE-NEXT: movapd 96(%rdi), %xmm5
+; SSE-NEXT: movapd 96(%rdi), %xmm4
; SSE-NEXT: movapd 96(%rsi), %xmm1
-; SSE-NEXT: movapd %xmm5, %xmm7
+; SSE-NEXT: movapd %xmm4, %xmm7
; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0]
; SSE-NEXT: movapd 96(%rdx), %xmm2
-; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movapd 112(%rdi), %xmm2
; SSE-NEXT: movapd 112(%rsi), %xmm0
; SSE-NEXT: movapd %xmm2, %xmm3
; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT: movapd 112(%rdx), %xmm4
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE-NEXT: movapd 112(%rdx), %xmm5
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
; SSE-NEXT: movapd %xmm0, 368(%rcx)
; SSE-NEXT: movapd %xmm2, 352(%rcx)
; SSE-NEXT: movapd %xmm3, 336(%rcx)
; SSE-NEXT: movapd %xmm1, 320(%rcx)
-; SSE-NEXT: movapd %xmm5, 304(%rcx)
+; SSE-NEXT: movapd %xmm4, 304(%rcx)
; SSE-NEXT: movapd %xmm7, 288(%rcx)
; SSE-NEXT: movapd %xmm6, 272(%rcx)
-; SSE-NEXT: movapd %xmm8, 256(%rcx)
-; SSE-NEXT: movapd %xmm9, 240(%rcx)
-; SSE-NEXT: movapd %xmm12, 224(%rcx)
+; SSE-NEXT: movapd %xmm15, 256(%rcx)
+; SSE-NEXT: movapd %xmm8, 240(%rcx)
+; SSE-NEXT: movapd %xmm9, 224(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%rcx)
; SSE-NEXT: movapd %xmm14, 192(%rcx)
-; SSE-NEXT: movapd %xmm11, 176(%rcx)
+; SSE-NEXT: movapd %xmm10, 176(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rcx)
-; SSE-NEXT: movapd %xmm10, 144(%rcx)
-; SSE-NEXT: movapd %xmm15, 128(%rcx)
+; SSE-NEXT: movapd %xmm13, 144(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 128(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rcx)
-; SSE-NEXT: movapd %xmm13, 96(%rcx)
+; SSE-NEXT: movapd %xmm12, 96(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rcx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 48(%rcx)
+; SSE-NEXT: movapd %xmm11, 48(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index f910adc3d93ac..af6659568687e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -88,31 +88,31 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps (%rsi), %xmm2
-; SSE-NEXT: movaps 16(%rsi), %xmm8
+; SSE-NEXT: movaps 16(%rsi), %xmm3
; SSE-NEXT: movaps (%rdx), %xmm4
; SSE-NEXT: movaps 16(%rdx), %xmm5
; SSE-NEXT: movaps (%rcx), %xmm6
-; SSE-NEXT: movaps 16(%rcx), %xmm9
-; SSE-NEXT: movaps %xmm4, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0]
+; SSE-NEXT: movaps 16(%rcx), %xmm7
+; SSE-NEXT: movaps %xmm4, %xmm8
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
+; SSE-NEXT: movaps %xmm0, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; SSE-NEXT: movaps %xmm5, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0]
; SSE-NEXT: movaps %xmm1, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE-NEXT: movaps %xmm1, 96(%r8)
; SSE-NEXT: movaps %xmm5, 112(%r8)
; SSE-NEXT: movaps %xmm6, 64(%r8)
; SSE-NEXT: movaps %xmm2, 80(%r8)
; SSE-NEXT: movaps %xmm0, 32(%r8)
; SSE-NEXT: movaps %xmm4, 48(%r8)
-; SSE-NEXT: movaps %xmm7, (%r8)
-; SSE-NEXT: movaps %xmm3, 16(%r8)
+; SSE-NEXT: movaps %xmm9, (%r8)
+; SSE-NEXT: movaps %xmm8, 16(%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i64_stride4_vf4:
@@ -193,65 +193,62 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
; SSE-LABEL: store_i64_stride4_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm8
-; SSE-NEXT: movaps 16(%rdi), %xmm13
-; SSE-NEXT: movaps 32(%rdi), %xmm9
+; SSE-NEXT: movaps (%rdi), %xmm5
+; SSE-NEXT: movaps 16(%rdi), %xmm1
+; SSE-NEXT: movaps 32(%rdi), %xmm3
; SSE-NEXT: movaps 48(%rdi), %xmm0
-; SSE-NEXT: movaps (%rsi), %xmm6
+; SSE-NEXT: movaps (%rsi), %xmm10
; SSE-NEXT: movaps 16(%rsi), %xmm12
-; SSE-NEXT: movaps 32(%rsi), %xmm1
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%rdx), %xmm14
+; SSE-NEXT: movaps 32(%rsi), %xmm11
+; SSE-NEXT: movaps (%rdx), %xmm2
; SSE-NEXT: movaps 16(%rdx), %xmm4
; SSE-NEXT: movaps 32(%rdx), %xmm7
-; SSE-NEXT: movaps 48(%rdx), %xmm5
-; SSE-NEXT: movaps (%rcx), %xmm1
-; SSE-NEXT: movaps 16(%rcx), %xmm2
-; SSE-NEXT: movaps 32(%rcx), %xmm15
-; SSE-NEXT: movaps 48(%rcx), %xmm11
-; SSE-NEXT: movaps %xmm14, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1]
-; SSE-NEXT: movaps %xmm8, %xmm3
-; SSE-NEXT: movaps %xmm8, %xmm10
-; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm6[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1]
-; SSE-NEXT: movaps %xmm4, %xmm8
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; SSE-NEXT: movaps %xmm13, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1]
+; SSE-NEXT: movaps 48(%rdx), %xmm9
+; SSE-NEXT: movaps (%rcx), %xmm8
+; SSE-NEXT: movaps 16(%rcx), %xmm13
+; SSE-NEXT: movaps 32(%rcx), %xmm14
+; SSE-NEXT: movaps 48(%rcx), %xmm15
+; SSE-NEXT: movaps %xmm2, %xmm6
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1]
+; SSE-NEXT: movaps %xmm5, %xmm8
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1]
+; SSE-NEXT: movaps %xmm4, %xmm10
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1]
+; SSE-NEXT: movaps %xmm1, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
; SSE-NEXT: movaps %xmm7, %xmm12
-; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1]
-; SSE-NEXT: movaps %xmm9, %xmm15
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1]
-; SSE-NEXT: movaps %xmm5, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1]
-; SSE-NEXT: movaps 48(%rsi), %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1]
+; SSE-NEXT: movaps %xmm3, %xmm14
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1]
+; SSE-NEXT: movaps %xmm9, %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1]
+; SSE-NEXT: movaps 48(%rsi), %xmm15
; SSE-NEXT: movaps %xmm0, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1]
; SSE-NEXT: movaps %xmm0, 224(%r8)
-; SSE-NEXT: movaps %xmm5, 240(%r8)
+; SSE-NEXT: movaps %xmm9, 240(%r8)
; SSE-NEXT: movaps %xmm6, 192(%r8)
-; SSE-NEXT: movaps %xmm2, 208(%r8)
-; SSE-NEXT: movaps %xmm9, 160(%r8)
+; SSE-NEXT: movaps %xmm11, 208(%r8)
+; SSE-NEXT: movaps %xmm3, 160(%r8)
; SSE-NEXT: movaps %xmm7, 176(%r8)
-; SSE-NEXT: movaps %xmm15, 128(%r8)
+; SSE-NEXT: movaps %xmm14, 128(%r8)
; SSE-NEXT: movaps %xmm12, 144(%r8)
-; SSE-NEXT: movaps %xmm13, 96(%r8)
+; SSE-NEXT: movaps %xmm1, 96(%r8)
; SSE-NEXT: movaps %xmm4, 112(%r8)
-; SSE-NEXT: movaps %xmm1, 64(%r8)
-; SSE-NEXT: movaps %xmm8, 80(%r8)
-; SSE-NEXT: movaps %xmm3, 32(%r8)
-; SSE-NEXT: movaps %xmm14, 48(%r8)
-; SSE-NEXT: movaps %xmm10, (%r8)
+; SSE-NEXT: movaps %xmm13, 64(%r8)
+; SSE-NEXT: movaps %xmm10, 80(%r8)
+; SSE-NEXT: movaps %xmm5, 32(%r8)
+; SSE-NEXT: movaps %xmm2, 48(%r8)
+; SSE-NEXT: movaps %xmm8, (%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%r8)
; SSE-NEXT: retq
@@ -266,46 +263,46 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vmovaps 48(%rsi), %xmm5
; AVX1-NEXT: vmovaps 48(%rdi), %xmm6
; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX1-NEXT: vmovaps 16(%rsi), %xmm5
; AVX1-NEXT: vmovaps 16(%rdi), %xmm6
; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: vmovaps 32(%rsi), %xmm3
; AVX1-NEXT: vmovaps 32(%rdi), %xmm5
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm5[0],xmm3[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm3[0]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1]
; AVX1-NEXT: vmovaps (%rsi), %xmm5
; AVX1-NEXT: vmovaps (%rdi), %xmm7
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1]
; AVX1-NEXT: vmovaps 32(%rcx), %xmm7
-; AVX1-NEXT: vmovaps 32(%rdx), %xmm1
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm7[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
-; AVX1-NEXT: vmovaps (%rcx), %xmm7
-; AVX1-NEXT: vmovaps (%rdx), %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm7[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
-; AVX1-NEXT: vmovaps %xmm2, 48(%r8)
-; AVX1-NEXT: vmovaps %xmm6, 16(%r8)
-; AVX1-NEXT: vmovaps %xmm1, 176(%r8)
-; AVX1-NEXT: vmovaps %xmm4, 144(%r8)
+; AVX1-NEXT: vmovaps 32(%rdx), %xmm9
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm7[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1]
+; AVX1-NEXT: vmovaps (%rcx), %xmm9
+; AVX1-NEXT: vmovaps (%rdx), %xmm11
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm9[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm9[1]
+; AVX1-NEXT: vmovaps %xmm9, 48(%r8)
+; AVX1-NEXT: vmovaps %xmm12, 16(%r8)
+; AVX1-NEXT: vmovaps %xmm7, 176(%r8)
+; AVX1-NEXT: vmovaps %xmm10, 144(%r8)
; AVX1-NEXT: vmovaps %xmm5, 32(%r8)
-; AVX1-NEXT: vmovaps %xmm0, (%r8)
+; AVX1-NEXT: vmovaps %xmm8, (%r8)
; AVX1-NEXT: vmovaps %xmm3, 160(%r8)
-; AVX1-NEXT: vmovaps %xmm12, 128(%r8)
-; AVX1-NEXT: vmovaps %ymm11, 96(%r8)
-; AVX1-NEXT: vmovaps %ymm10, 64(%r8)
-; AVX1-NEXT: vmovaps %ymm9, 224(%r8)
-; AVX1-NEXT: vmovaps %ymm8, 192(%r8)
+; AVX1-NEXT: vmovaps %xmm6, 128(%r8)
+; AVX1-NEXT: vmovaps %ymm2, 96(%r8)
+; AVX1-NEXT: vmovaps %ymm4, 64(%r8)
+; AVX1-NEXT: vmovaps %ymm1, 224(%r8)
+; AVX1-NEXT: vmovaps %ymm0, 192(%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -321,44 +318,44 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovaps 32(%rcx), %ymm8
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm6[2,3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm6[2,3],ymm4[2,3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],ymm5[2,3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3]
; AVX2-NEXT: vmovaps (%rsi), %xmm3
; AVX2-NEXT: vmovaps 32(%rsi), %xmm5
; AVX2-NEXT: vmovaps (%rdi), %xmm6
; AVX2-NEXT: vmovaps 32(%rdi), %xmm7
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm7[0],xmm5[0]
-; AVX2-NEXT: vmovaps (%rcx), %xmm1
-; AVX2-NEXT: vmovaps 32(%rcx), %xmm4
-; AVX2-NEXT: vmovaps (%rdx), %xmm2
-; AVX2-NEXT: vmovaps 32(%rdx), %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm4[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0]
+; AVX2-NEXT: vmovaps (%rcx), %xmm9
+; AVX2-NEXT: vmovaps 32(%rcx), %xmm10
+; AVX2-NEXT: vmovaps (%rdx), %xmm11
+; AVX2-NEXT: vmovaps 32(%rdx), %xmm12
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm10[0]
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm6[0],xmm3[0]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm12[1],xmm10[1]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm3[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm9[0]
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX2-NEXT: vmovaps %xmm1, 48(%r8)
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm11[1],xmm9[1]
+; AVX2-NEXT: vmovaps %xmm6, 48(%r8)
; AVX2-NEXT: vmovaps %xmm3, 32(%r8)
-; AVX2-NEXT: vmovaps %xmm7, 16(%r8)
-; AVX2-NEXT: vmovaps %xmm4, (%r8)
-; AVX2-NEXT: vmovaps %xmm0, 176(%r8)
+; AVX2-NEXT: vmovaps %xmm12, 16(%r8)
+; AVX2-NEXT: vmovaps %xmm10, (%r8)
+; AVX2-NEXT: vmovaps %xmm7, 176(%r8)
; AVX2-NEXT: vmovaps %xmm5, 160(%r8)
; AVX2-NEXT: vmovaps %xmm13, 144(%r8)
-; AVX2-NEXT: vmovaps %xmm12, 128(%r8)
-; AVX2-NEXT: vmovaps %ymm11, 96(%r8)
-; AVX2-NEXT: vmovaps %ymm10, 64(%r8)
-; AVX2-NEXT: vmovaps %ymm8, 224(%r8)
-; AVX2-NEXT: vmovaps %ymm9, 192(%r8)
+; AVX2-NEXT: vmovaps %xmm8, 128(%r8)
+; AVX2-NEXT: vmovaps %ymm2, 96(%r8)
+; AVX2-NEXT: vmovaps %ymm4, 64(%r8)
+; AVX2-NEXT: vmovaps %ymm1, 224(%r8)
+; AVX2-NEXT: vmovaps %ymm0, 192(%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -415,62 +412,62 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-LABEL: store_i64_stride4_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $152, %rsp
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps 16(%rdi), %xmm8
-; SSE-NEXT: movaps 32(%rdi), %xmm9
-; SSE-NEXT: movaps 48(%rdi), %xmm10
+; SSE-NEXT: movaps (%rdi), %xmm6
+; SSE-NEXT: movaps 16(%rdi), %xmm7
+; SSE-NEXT: movaps 32(%rdi), %xmm8
+; SSE-NEXT: movaps 48(%rdi), %xmm9
; SSE-NEXT: movaps (%rsi), %xmm1
; SSE-NEXT: movaps 16(%rsi), %xmm2
-; SSE-NEXT: movaps 32(%rsi), %xmm13
-; SSE-NEXT: movaps 48(%rsi), %xmm11
-; SSE-NEXT: movaps (%rdx), %xmm6
-; SSE-NEXT: movaps 16(%rdx), %xmm12
-; SSE-NEXT: movaps 32(%rdx), %xmm14
-; SSE-NEXT: movaps 48(%rdx), %xmm15
+; SSE-NEXT: movaps 32(%rsi), %xmm0
+; SSE-NEXT: movaps 48(%rsi), %xmm15
+; SSE-NEXT: movaps (%rdx), %xmm10
+; SSE-NEXT: movaps 16(%rdx), %xmm11
+; SSE-NEXT: movaps 32(%rdx), %xmm13
+; SSE-NEXT: movaps 48(%rdx), %xmm14
; SSE-NEXT: movaps (%rcx), %xmm3
; SSE-NEXT: movaps 16(%rcx), %xmm4
; SSE-NEXT: movaps 32(%rcx), %xmm5
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-NEXT: movaps %xmm6, %xmm12
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0]
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1]
+; SSE-NEXT: movaps %xmm10, %xmm6
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1]
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm7, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm11, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm8, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0]
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm13[1]
-; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm14, %xmm1
+; SSE-NEXT: movaps %xmm13, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1]
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1]
+; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1]
-; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1]
+; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 48(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm15, %xmm1
+; SSE-NEXT: movaps %xmm14, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
-; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
+; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdi), %xmm15
; SSE-NEXT: movaps 64(%rsi), %xmm1
; SSE-NEXT: movaps %xmm15, %xmm0
@@ -567,87 +564,87 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-LABEL: store_i64_stride4_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $152, %rsp
-; AVX1-NEXT: vmovaps 96(%rdx), %ymm12
-; AVX1-NEXT: vmovaps 64(%rdx), %ymm1
-; AVX1-NEXT: vmovaps 32(%rdx), %ymm2
-; AVX1-NEXT: vmovaps (%rdx), %ymm3
-; AVX1-NEXT: vmovaps 96(%rcx), %ymm13
-; AVX1-NEXT: vmovaps 64(%rcx), %ymm5
+; AVX1-NEXT: vmovaps 96(%rdx), %ymm7
+; AVX1-NEXT: vmovaps 64(%rdx), %ymm5
+; AVX1-NEXT: vmovaps 32(%rdx), %ymm3
+; AVX1-NEXT: vmovaps (%rdx), %ymm1
+; AVX1-NEXT: vmovaps 96(%rcx), %ymm8
+; AVX1-NEXT: vmovaps 64(%rcx), %ymm6
; AVX1-NEXT: vmovaps 32(%rcx), %ymm4
-; AVX1-NEXT: vmovaps (%rcx), %ymm6
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm3[1],ymm6[1],ymm3[3],ymm6[3]
-; AVX1-NEXT: vmovaps 16(%rsi), %xmm0
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm7
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm7[1],xmm0[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX1-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm0[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: vmovaps (%rcx), %ymm2
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX1-NEXT: vmovaps 16(%rsi), %xmm9
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm10
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3]
-; AVX1-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX1-NEXT: vmovaps 48(%rdi), %xmm6
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm3[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm9[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
+; AVX1-NEXT: vmovaps 48(%rsi), %xmm9
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm10
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm9[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm5[1],ymm1[3],ymm5[3]
-; AVX1-NEXT: vmovaps 80(%rsi), %xmm2
-; AVX1-NEXT: vmovaps 80(%rdi), %xmm3
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm2[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
+; AVX1-NEXT: vmovaps 80(%rsi), %xmm9
+; AVX1-NEXT: vmovaps 80(%rdi), %xmm10
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm4[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm10[0],xmm9[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
-; AVX1-NEXT: vmovaps 112(%rsi), %xmm1
-; AVX1-NEXT: vmovaps 112(%rdi), %xmm2
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
+; AVX1-NEXT: vmovaps 112(%rsi), %xmm9
+; AVX1-NEXT: vmovaps 112(%rdi), %xmm10
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm9[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovaps 64(%rsi), %xmm2
-; AVX1-NEXT: vmovaps 64(%rdi), %xmm3
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vmovaps 64(%rsi), %xmm10
+; AVX1-NEXT: vmovaps 64(%rdi), %xmm11
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps 32(%rcx), %xmm4
-; AVX1-NEXT: vmovaps 64(%rcx), %xmm5
-; AVX1-NEXT: vmovaps 64(%rdx), %xmm7
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm5[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm3[1],xmm2[1]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm7[1],xmm5[1]
-; AVX1-NEXT: vmovaps 32(%rsi), %xmm5
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm7
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm7[0],xmm5[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm7[1],xmm5[1]
-; AVX1-NEXT: vmovaps 32(%rdx), %xmm7
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm11 = xmm7[0],xmm4[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm7[1],xmm4[1]
-; AVX1-NEXT: vmovaps 96(%rsi), %xmm7
+; AVX1-NEXT: vmovaps 32(%rcx), %xmm12
+; AVX1-NEXT: vmovaps 64(%rcx), %xmm13
+; AVX1-NEXT: vmovaps 64(%rdx), %xmm14
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1]
+; AVX1-NEXT: vmovaps 32(%rsi), %xmm13
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm14
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX1-NEXT: vmovaps 32(%rdx), %xmm14
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm14[0],xmm12[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX1-NEXT: vmovaps 96(%rsi), %xmm14
; AVX1-NEXT: vmovaps 96(%rdi), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm7[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm7[1]
-; AVX1-NEXT: vmovaps 96(%rcx), %xmm7
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm14[1]
+; AVX1-NEXT: vmovaps 96(%rcx), %xmm14
; AVX1-NEXT: vmovaps 96(%rdx), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm7[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm7[1]
-; AVX1-NEXT: vmovaps (%rsi), %xmm7
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm14[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm14[1]
+; AVX1-NEXT: vmovaps (%rsi), %xmm14
; AVX1-NEXT: vmovaps (%rdi), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm7[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm7[1]
-; AVX1-NEXT: vmovaps (%rcx), %xmm7
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm14[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm14[1]
+; AVX1-NEXT: vmovaps (%rcx), %xmm14
; AVX1-NEXT: vmovaps (%rdx), %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm7[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1]
; AVX1-NEXT: vmovaps %xmm0, 48(%r8)
; AVX1-NEXT: vmovaps %xmm2, 32(%r8)
; AVX1-NEXT: vmovaps %xmm1, 16(%r8)
@@ -655,14 +652,14 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-NEXT: vmovaps %xmm4, 432(%r8)
; AVX1-NEXT: vmovaps %xmm6, 416(%r8)
; AVX1-NEXT: vmovaps %xmm5, 400(%r8)
-; AVX1-NEXT: vmovaps %xmm8, 384(%r8)
-; AVX1-NEXT: vmovaps %xmm9, 176(%r8)
-; AVX1-NEXT: vmovaps %xmm10, 160(%r8)
-; AVX1-NEXT: vmovaps %xmm11, 144(%r8)
-; AVX1-NEXT: vmovaps %xmm12, 128(%r8)
-; AVX1-NEXT: vmovaps %xmm13, 304(%r8)
-; AVX1-NEXT: vmovaps %xmm14, 288(%r8)
-; AVX1-NEXT: vmovaps %xmm15, 272(%r8)
+; AVX1-NEXT: vmovaps %xmm7, 384(%r8)
+; AVX1-NEXT: vmovaps %xmm12, 176(%r8)
+; AVX1-NEXT: vmovaps %xmm13, 160(%r8)
+; AVX1-NEXT: vmovaps %xmm8, 144(%r8)
+; AVX1-NEXT: vmovaps %xmm15, 128(%r8)
+; AVX1-NEXT: vmovaps %xmm11, 304(%r8)
+; AVX1-NEXT: vmovaps %xmm10, 288(%r8)
+; AVX1-NEXT: vmovaps %xmm9, 272(%r8)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 256(%r8)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -688,11 +685,11 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-LABEL: store_i64_stride4_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $152, %rsp
-; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm7
; AVX2-NEXT: vmovaps 64(%rdi), %ymm5
; AVX2-NEXT: vmovaps 32(%rdi), %ymm3
; AVX2-NEXT: vmovaps (%rdi), %ymm1
-; AVX2-NEXT: vmovaps 96(%rsi), %ymm7
+; AVX2-NEXT: vmovaps 96(%rsi), %ymm8
; AVX2-NEXT: vmovaps 64(%rsi), %ymm6
; AVX2-NEXT: vmovaps 32(%rsi), %ymm4
; AVX2-NEXT: vmovaps (%rsi), %ymm2
@@ -702,88 +699,88 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vmovaps (%rdx), %ymm12
; AVX2-NEXT: vmovaps 96(%rcx), %ymm13
; AVX2-NEXT: vmovaps (%rcx), %ymm14
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm8[2,3]
-; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 64(%rcx), %ymm15
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm14[0],ymm12[2],ymm14[2]
; AVX2-NEXT: vmovaps 32(%rcx), %ymm14
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm12[2,3]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm11[1],ymm14[1],ymm11[3],ymm14[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm2[2,3]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm2[2,3]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm11[2,3]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm11[2,3]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm15[1],ymm10[3],ymm15[3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm4[2,3]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm4[2,3]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm15[0],ymm10[2],ymm15[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm10[2,3]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm10[2,3]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm9[1],ymm13[1],ymm9[3],ymm13[3]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm0[1],ymm7[1],ymm0[3],ymm7[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm6[2,3]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm6[2,3]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm13[0],ymm9[2],ymm13[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm9[2,3]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 32(%rsi), %xmm2
-; AVX2-NEXT: vmovaps 64(%rsi), %xmm3
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
-; AVX2-NEXT: vmovaps 64(%rdi), %xmm5
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm3[0]
+; AVX2-NEXT: vmovaps 32(%rsi), %xmm9
+; AVX2-NEXT: vmovaps 64(%rsi), %xmm10
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm11
+; AVX2-NEXT: vmovaps 64(%rdi), %xmm12
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm10[0]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 32(%rcx), %xmm6
-; AVX2-NEXT: vmovaps 64(%rcx), %xmm7
-; AVX2-NEXT: vmovaps 64(%rdx), %xmm0
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm5[1],xmm3[1]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm7[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm7[1]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm2[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm4[1],xmm2[1]
-; AVX2-NEXT: vmovaps 32(%rdx), %xmm4
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm11 = xmm4[0],xmm6[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm4[1],xmm6[1]
-; AVX2-NEXT: vmovaps 96(%rsi), %xmm6
+; AVX2-NEXT: vmovaps 32(%rcx), %xmm13
+; AVX2-NEXT: vmovaps 64(%rcx), %xmm14
+; AVX2-NEXT: vmovaps 64(%rdx), %xmm15
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm14[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm9[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm9[1]
+; AVX2-NEXT: vmovaps 32(%rdx), %xmm11
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm13[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1]
+; AVX2-NEXT: vmovaps 96(%rsi), %xmm13
; AVX2-NEXT: vmovaps 96(%rdi), %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm6[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm6[1]
-; AVX2-NEXT: vmovaps 96(%rcx), %xmm6
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm13[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm13[1]
+; AVX2-NEXT: vmovaps 96(%rcx), %xmm13
; AVX2-NEXT: vmovaps 96(%rdx), %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm6[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm6[1]
-; AVX2-NEXT: vmovaps (%rsi), %xmm6
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm13[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm13[1]
+; AVX2-NEXT: vmovaps (%rsi), %xmm13
; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm6[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm6[1]
-; AVX2-NEXT: vmovaps (%rcx), %xmm6
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm13[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm13[1]
+; AVX2-NEXT: vmovaps (%rcx), %xmm13
; AVX2-NEXT: vmovaps (%rdx), %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm6[0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1]
; AVX2-NEXT: vmovaps %xmm0, 48(%r8)
; AVX2-NEXT: vmovaps %xmm2, 32(%r8)
; AVX2-NEXT: vmovaps %xmm1, 16(%r8)
; AVX2-NEXT: vmovaps %xmm3, (%r8)
; AVX2-NEXT: vmovaps %xmm4, 432(%r8)
-; AVX2-NEXT: vmovaps %xmm7, 416(%r8)
+; AVX2-NEXT: vmovaps %xmm6, 416(%r8)
; AVX2-NEXT: vmovaps %xmm5, 400(%r8)
-; AVX2-NEXT: vmovaps %xmm8, 384(%r8)
-; AVX2-NEXT: vmovaps %xmm9, 176(%r8)
-; AVX2-NEXT: vmovaps %xmm10, 160(%r8)
-; AVX2-NEXT: vmovaps %xmm11, 144(%r8)
-; AVX2-NEXT: vmovaps %xmm12, 128(%r8)
-; AVX2-NEXT: vmovaps %xmm13, 304(%r8)
-; AVX2-NEXT: vmovaps %xmm14, 288(%r8)
-; AVX2-NEXT: vmovaps %xmm15, 272(%r8)
+; AVX2-NEXT: vmovaps %xmm7, 384(%r8)
+; AVX2-NEXT: vmovaps %xmm11, 176(%r8)
+; AVX2-NEXT: vmovaps %xmm9, 160(%r8)
+; AVX2-NEXT: vmovaps %xmm8, 144(%r8)
+; AVX2-NEXT: vmovaps %xmm15, 128(%r8)
+; AVX2-NEXT: vmovaps %xmm14, 304(%r8)
+; AVX2-NEXT: vmovaps %xmm10, 288(%r8)
+; AVX2-NEXT: vmovaps %xmm12, 272(%r8)
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vmovaps %xmm0, 256(%r8)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index 32d3440db85de..2d71de822058c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -13,24 +13,24 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps (%rsi), %xmm8
+; SSE-NEXT: movaps (%rsi), %xmm1
; SSE-NEXT: movaps (%rdx), %xmm2
; SSE-NEXT: movaps (%rcx), %xmm3
; SSE-NEXT: movaps (%r8), %xmm4
; SSE-NEXT: movaps (%r9), %xmm5
; SSE-NEXT: movaps %xmm0, %xmm6
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT: movaps %xmm4, %xmm7
; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0]
-; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE-NEXT: movaps %xmm2, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movaps %xmm2, 16(%rax)
; SSE-NEXT: movaps %xmm0, 48(%rax)
; SSE-NEXT: movaps %xmm4, 80(%rax)
-; SSE-NEXT: movaps %xmm1, 64(%rax)
+; SSE-NEXT: movaps %xmm8, 64(%rax)
; SSE-NEXT: movaps %xmm7, 32(%rax)
; SSE-NEXT: movaps %xmm6, (%rax)
; SSE-NEXT: retq
@@ -122,48 +122,48 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-LABEL: store_i64_stride6_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps 16(%rdi), %xmm15
-; SSE-NEXT: movaps (%rsi), %xmm8
-; SSE-NEXT: movaps 16(%rsi), %xmm10
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm0
+; SSE-NEXT: movaps (%rsi), %xmm5
+; SSE-NEXT: movaps 16(%rsi), %xmm7
; SSE-NEXT: movaps (%rdx), %xmm6
; SSE-NEXT: movaps 16(%rdx), %xmm1
-; SSE-NEXT: movaps (%rcx), %xmm9
-; SSE-NEXT: movaps 16(%rcx), %xmm5
-; SSE-NEXT: movaps (%r8), %xmm7
+; SSE-NEXT: movaps (%rcx), %xmm8
+; SSE-NEXT: movaps 16(%rcx), %xmm9
+; SSE-NEXT: movaps (%r8), %xmm10
; SSE-NEXT: movaps 16(%r8), %xmm4
; SSE-NEXT: movaps (%r9), %xmm11
-; SSE-NEXT: movaps 16(%r9), %xmm3
-; SSE-NEXT: movaps %xmm4, %xmm12
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1]
+; SSE-NEXT: movaps 16(%r9), %xmm12
+; SSE-NEXT: movaps %xmm4, %xmm3
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1]
; SSE-NEXT: movaps %xmm1, %xmm13
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1]
-; SSE-NEXT: movaps %xmm15, %xmm14
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm10[0]
-; SSE-NEXT: movaps %xmm7, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1]
-; SSE-NEXT: movaps %xmm6, %xmm5
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1]
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0]
-; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0]
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0]
-; SSE-NEXT: movaps %xmm0, (%rax)
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1]
+; SSE-NEXT: movaps %xmm0, %xmm14
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; SSE-NEXT: movaps %xmm10, %xmm7
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1]
+; SSE-NEXT: movaps %xmm6, %xmm9
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1]
+; SSE-NEXT: movaps %xmm2, %xmm12
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE-NEXT: movaps %xmm2, (%rax)
; SSE-NEXT: movaps %xmm6, 16(%rax)
-; SSE-NEXT: movaps %xmm7, 32(%rax)
-; SSE-NEXT: movaps %xmm2, 48(%rax)
-; SSE-NEXT: movaps %xmm5, 64(%rax)
-; SSE-NEXT: movaps %xmm3, 80(%rax)
-; SSE-NEXT: movaps %xmm15, 96(%rax)
+; SSE-NEXT: movaps %xmm10, 32(%rax)
+; SSE-NEXT: movaps %xmm12, 48(%rax)
+; SSE-NEXT: movaps %xmm9, 64(%rax)
+; SSE-NEXT: movaps %xmm7, 80(%rax)
+; SSE-NEXT: movaps %xmm0, 96(%rax)
; SSE-NEXT: movaps %xmm1, 112(%rax)
; SSE-NEXT: movaps %xmm4, 128(%rax)
; SSE-NEXT: movaps %xmm14, 144(%rax)
; SSE-NEXT: movaps %xmm13, 160(%rax)
-; SSE-NEXT: movaps %xmm12, 176(%rax)
+; SSE-NEXT: movaps %xmm3, 176(%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i64_stride6_vf4:
@@ -171,18 +171,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: vmovapd (%rdi), %ymm0
; AVX1-NEXT: vmovapd (%rsi), %ymm1
-; AVX1-NEXT: vmovaps (%rdx), %ymm8
+; AVX1-NEXT: vmovaps (%rdx), %ymm2
; AVX1-NEXT: vmovapd (%r8), %ymm3
; AVX1-NEXT: vmovapd (%r9), %ymm4
; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0]
; AVX1-NEXT: vmovaps (%rsi), %xmm6
; AVX1-NEXT: vmovaps (%rdi), %xmm7
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3]
-; AVX1-NEXT: vmovaps (%rcx), %xmm5
-; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm9
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX1-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0],ymm8[1,2,3]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3]
+; AVX1-NEXT: vmovaps (%rcx), %xmm8
+; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm8, %ymm9
; AVX1-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,3,2,3]
; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm11
; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
@@ -191,70 +191,70 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[3]
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm3
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: vmovapd 16(%rdx), %xmm3
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1]
-; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm8
-; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm6[0]
-; AVX1-NEXT: vmovaps (%rdx), %xmm6
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0]
-; AVX1-NEXT: vmovaps %xmm5, 16(%rax)
-; AVX1-NEXT: vmovaps %xmm4, (%rax)
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm2
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vmovapd 16(%rdx), %xmm2
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1]
+; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm3
+; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0]
+; AVX1-NEXT: vmovaps (%rdx), %xmm4
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0]
+; AVX1-NEXT: vmovaps %xmm4, 16(%rax)
+; AVX1-NEXT: vmovaps %xmm3, (%rax)
; AVX1-NEXT: vmovaps %ymm1, 96(%rax)
; AVX1-NEXT: vmovapd %ymm0, 128(%rax)
; AVX1-NEXT: vmovaps %ymm9, 64(%rax)
-; AVX1-NEXT: vmovapd %ymm2, 32(%rax)
-; AVX1-NEXT: vmovapd %ymm3, 160(%rax)
+; AVX1-NEXT: vmovapd %ymm5, 32(%rax)
+; AVX1-NEXT: vmovapd %ymm2, 160(%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_i64_stride6_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovaps (%rdi), %ymm8
-; AVX2-NEXT: vmovaps (%rsi), %ymm11
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps (%rsi), %ymm1
; AVX2-NEXT: vmovaps (%rdx), %ymm2
; AVX2-NEXT: vmovaps (%rcx), %ymm3
; AVX2-NEXT: vmovaps (%r8), %ymm4
; AVX2-NEXT: vmovaps (%r9), %xmm5
; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6
; AVX2-NEXT: vmovaps (%rcx), %xmm7
-; AVX2-NEXT: vmovaps (%rdx), %xmm0
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm7[1]
+; AVX2-NEXT: vmovaps (%rdx), %xmm8
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1]
; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm10
; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0]
-; AVX2-NEXT: vmovaps (%rsi), %xmm6
-; AVX2-NEXT: vmovaps (%rdi), %xmm1
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm1[1],xmm6[1]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[0,1],ymm10[0,1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5,6,7]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm11[0],ymm8[2],ymm11[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
+; AVX2-NEXT: vmovaps (%rsi), %xmm9
+; AVX2-NEXT: vmovaps (%rdi), %xmm10
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[0,1],ymm11[0,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6,7]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm3
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm11[1],ymm8[3],ymm11[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
-; AVX2-NEXT: vmovaps %xmm0, 16(%rax)
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
+; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0]
+; AVX2-NEXT: vmovaps %xmm3, 16(%rax)
; AVX2-NEXT: vmovaps %xmm1, (%rax)
-; AVX2-NEXT: vmovaps %ymm10, 96(%rax)
-; AVX2-NEXT: vmovaps %ymm3, 128(%rax)
+; AVX2-NEXT: vmovaps %ymm11, 96(%rax)
+; AVX2-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-NEXT: vmovaps %ymm2, 160(%rax)
; AVX2-NEXT: vmovaps %ymm5, 32(%rax)
-; AVX2-NEXT: vmovaps %ymm9, 64(%rax)
+; AVX2-NEXT: vmovaps %ymm6, 64(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -309,96 +309,98 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-LABEL: store_i64_stride6_vf8:
; SSE: # %bb.0:
; SSE-NEXT: subq $24, %rsp
-; SSE-NEXT: movaps (%rdi), %xmm7
-; SSE-NEXT: movaps 16(%rdi), %xmm10
-; SSE-NEXT: movaps 32(%rdi), %xmm15
-; SSE-NEXT: movaps (%rsi), %xmm11
-; SSE-NEXT: movaps 16(%rsi), %xmm4
-; SSE-NEXT: movaps 32(%rsi), %xmm8
-; SSE-NEXT: movaps (%rdx), %xmm12
-; SSE-NEXT: movaps 16(%rdx), %xmm9
-; SSE-NEXT: movaps 32(%rdx), %xmm14
-; SSE-NEXT: movaps (%rcx), %xmm3
-; SSE-NEXT: movaps 16(%rcx), %xmm6
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movaps 16(%rdi), %xmm1
+; SSE-NEXT: movaps 32(%rdi), %xmm3
+; SSE-NEXT: movaps (%rsi), %xmm9
+; SSE-NEXT: movaps 16(%rsi), %xmm13
+; SSE-NEXT: movaps 32(%rsi), %xmm12
+; SSE-NEXT: movaps (%rdx), %xmm2
+; SSE-NEXT: movaps 16(%rdx), %xmm4
+; SSE-NEXT: movaps 32(%rdx), %xmm7
+; SSE-NEXT: movaps (%rcx), %xmm10
+; SSE-NEXT: movaps 16(%rcx), %xmm14
; SSE-NEXT: movaps (%r8), %xmm5
-; SSE-NEXT: movaps 16(%r8), %xmm13
-; SSE-NEXT: movaps (%r9), %xmm0
-; SSE-NEXT: movaps 16(%r9), %xmm1
-; SSE-NEXT: movaps %xmm7, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0]
-; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1]
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm7
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0]
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm5, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm10, %xmm11
-; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm4[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1]
-; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm13, %xmm10
-; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1]
-; SSE-NEXT: movaps %xmm15, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
+; SSE-NEXT: movaps 16(%r8), %xmm8
+; SSE-NEXT: movaps (%r9), %xmm11
+; SSE-NEXT: movaps 16(%r9), %xmm15
+; SSE-NEXT: movaps %xmm0, %xmm6
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0]
+; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 32(%rcx), %xmm1
-; SSE-NEXT: movaps %xmm14, %xmm12
-; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1]
-; SSE-NEXT: movaps 32(%r8), %xmm5
+; SSE-NEXT: movaps %xmm2, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm5, %xmm10
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1]
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm8, %xmm14
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm15[1]
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rcx), %xmm12
+; SSE-NEXT: movaps %xmm7, %xmm8
+; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%r8), %xmm12
; SSE-NEXT: movaps 32(%r9), %xmm0
-; SSE-NEXT: movaps %xmm5, %xmm8
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
-; SSE-NEXT: movaps 48(%rdi), %xmm6
-; SSE-NEXT: movaps 48(%rsi), %xmm3
-; SSE-NEXT: movaps %xmm6, %xmm7
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-NEXT: movaps 48(%rdx), %xmm3
-; SSE-NEXT: movaps 48(%rcx), %xmm2
-; SSE-NEXT: movaps %xmm3, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE-NEXT: movaps 48(%r8), %xmm2
+; SSE-NEXT: movaps %xmm12, %xmm7
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
+; SSE-NEXT: movaps 48(%rdi), %xmm5
+; SSE-NEXT: movaps 48(%rsi), %xmm2
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-NEXT: movaps 48(%rdx), %xmm2
+; SSE-NEXT: movaps 48(%rcx), %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSE-NEXT: movaps 48(%r8), %xmm3
; SSE-NEXT: movaps 48(%r9), %xmm4
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm2, 368(%rax)
-; SSE-NEXT: movaps %xmm3, 352(%rax)
-; SSE-NEXT: movaps %xmm6, 336(%rax)
+; SSE-NEXT: movaps %xmm3, 368(%rax)
+; SSE-NEXT: movaps %xmm2, 352(%rax)
+; SSE-NEXT: movaps %xmm5, 336(%rax)
; SSE-NEXT: movaps %xmm0, 320(%rax)
; SSE-NEXT: movaps %xmm1, 304(%rax)
-; SSE-NEXT: movaps %xmm7, 288(%rax)
-; SSE-NEXT: movaps %xmm5, 272(%rax)
-; SSE-NEXT: movaps %xmm14, 256(%rax)
+; SSE-NEXT: movaps %xmm6, 288(%rax)
+; SSE-NEXT: movaps %xmm12, 272(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 256(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 240(%rax)
-; SSE-NEXT: movaps %xmm8, 224(%rax)
-; SSE-NEXT: movaps %xmm12, 208(%rax)
+; SSE-NEXT: movaps %xmm7, 224(%rax)
+; SSE-NEXT: movaps %xmm8, 208(%rax)
; SSE-NEXT: movaps %xmm15, 192(%rax)
-; SSE-NEXT: movaps %xmm13, 176(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 176(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%rax)
-; SSE-NEXT: movaps %xmm10, 128(%rax)
-; SSE-NEXT: movaps %xmm9, 112(%rax)
+; SSE-NEXT: movaps %xmm14, 128(%rax)
+; SSE-NEXT: movaps %xmm13, 112(%rax)
; SSE-NEXT: movaps %xmm11, 96(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
@@ -406,10 +408,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 32(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 16(%rax)
+; SSE-NEXT: movaps %xmm10, 32(%rax)
+; SSE-NEXT: movaps %xmm9, 16(%rax)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: addq $24, %rsp
@@ -417,7 +417,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX1-LABEL: store_i64_stride6_vf8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd (%rdi), %ymm15
+; AVX1-NEXT: vmovapd (%rdi), %ymm7
; AVX1-NEXT: vmovapd 32(%rdi), %ymm12
; AVX1-NEXT: vmovapd (%rsi), %ymm9
; AVX1-NEXT: vmovapd 32(%rsi), %ymm13
@@ -437,75 +437,75 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vmovaps 32(%rsi), %xmm6
; AVX1-NEXT: vmovaps (%rdi), %xmm4
; AVX1-NEXT: vmovaps 16(%rdi), %xmm11
-; AVX1-NEXT: vmovaps 32(%rdi), %xmm0
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm6[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX1-NEXT: vblendpd {{.*#+}} ymm7 = ymm14[0],ymm7[1,2,3]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3]
-; AVX1-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovaps 32(%rdi), %xmm8
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm8[1],xmm6[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
+; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0],ymm15[1,2,3]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3]
+; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
-; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm7
-; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm7[6,7]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm11 = ymm1[0],ymm7[1],ymm1[2,3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm13[2,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm7[0],ymm1[0],ymm7[2],ymm1[3]
+; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm11
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm4[1],xmm3[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
+; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm10[0],ymm15[1,2,3]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],ymm13[2,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[2],ymm15[3]
; AVX1-NEXT: vmovaps 32(%rcx), %xmm14
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm7
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7]
-; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm7
-; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm7[2,3],ymm1[4,5],ymm7[6,7]
-; AVX1-NEXT: vmovapd (%r9), %ymm1
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm15[1],ymm9[1],ymm15[3],ymm9[3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],ymm9[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,3,2,3]
+; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm15
+; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7]
+; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm15
+; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
+; AVX1-NEXT: vmovapd (%r9), %ymm15
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3]
; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm10
; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7]
; AVX1-NEXT: vmovaps (%rcx), %xmm10
-; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm15
-; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7]
-; AVX1-NEXT: vmovapd 48(%rdx), %xmm5
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1]
-; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm15
-; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3]
-; AVX1-NEXT: vmovapd 16(%rdx), %xmm5
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1]
-; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm15
-; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX1-NEXT: vmovaps 32(%rdx), %xmm5
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0]
+; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
+; AVX1-NEXT: vmovapd 48(%rdx), %xmm9
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],mem[1]
+; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2],ymm9[3]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3]
+; AVX1-NEXT: vmovapd 16(%rdx), %xmm2
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1]
+; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm9
+; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0]
+; AVX1-NEXT: vmovaps 32(%rdx), %xmm8
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0]
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX1-NEXT: vmovaps (%rdx), %xmm4
; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0]
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: vmovaps %xmm4, 16(%rax)
; AVX1-NEXT: vmovaps %xmm3, (%rax)
-; AVX1-NEXT: vmovaps %xmm5, 208(%rax)
-; AVX1-NEXT: vmovaps %xmm0, 192(%rax)
-; AVX1-NEXT: vmovaps %ymm9, 64(%rax)
+; AVX1-NEXT: vmovaps %xmm8, 208(%rax)
+; AVX1-NEXT: vmovaps %xmm6, 192(%rax)
+; AVX1-NEXT: vmovaps %ymm0, 64(%rax)
; AVX1-NEXT: vmovapd %ymm7, 128(%rax)
; AVX1-NEXT: vmovaps %ymm13, 256(%rax)
; AVX1-NEXT: vmovapd %ymm12, 320(%rax)
; AVX1-NEXT: vmovapd %ymm11, 32(%rax)
-; AVX1-NEXT: vmovaps %ymm8, 96(%rax)
-; AVX1-NEXT: vmovapd %ymm1, 160(%rax)
+; AVX1-NEXT: vmovaps %ymm5, 96(%rax)
+; AVX1-NEXT: vmovapd %ymm2, 160(%rax)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 224(%rax)
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %ymm0, 288(%rax)
-; AVX1-NEXT: vmovapd %ymm2, 352(%rax)
+; AVX1-NEXT: vmovapd %ymm1, 352(%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -513,87 +513,87 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rax
; AVX2-NEXT: vmovaps 32(%rdx), %ymm7
-; AVX2-NEXT: vmovaps (%r8), %ymm11
+; AVX2-NEXT: vmovaps (%r8), %ymm4
; AVX2-NEXT: vmovaps 32(%r8), %ymm13
-; AVX2-NEXT: vmovaps (%r9), %xmm8
-; AVX2-NEXT: vmovaps 32(%r9), %xmm0
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX2-NEXT: vmovaps (%rcx), %xmm5
-; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 32(%rcx), %xmm15
-; AVX2-NEXT: vmovaps (%rdx), %xmm3
-; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovaps 32(%rdx), %xmm12
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm15[1]
-; AVX2-NEXT: vbroadcastsd 40(%r8), %ymm6
-; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = xmm0[0,0]
-; AVX2-NEXT: vmovaps (%rsi), %xmm4
-; AVX2-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX2-NEXT: vmovaps (%rdi), %xmm6
-; AVX2-NEXT: vmovaps 32(%rdi), %xmm2
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm2[1],xmm1[1]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[0,1],ymm10[0,1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vmovaps (%r9), %xmm14
+; AVX2-NEXT: vmovaps 32(%r9), %xmm6
+; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX2-NEXT: vmovaps (%rcx), %xmm1
+; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 32(%rcx), %xmm3
+; AVX2-NEXT: vmovaps (%rdx), %xmm2
+; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps 32(%rdx), %xmm5
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm5[1],xmm3[1]
+; AVX2-NEXT: vbroadcastsd 40(%r8), %ymm9
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm3[1],xmm5[1]
-; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm14
-; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5],ymm9[6,7]
-; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm14
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = xmm6[0,0]
+; AVX2-NEXT: vmovaps (%rsi), %xmm8
+; AVX2-NEXT: vmovaps 32(%rsi), %xmm11
+; AVX2-NEXT: vmovaps (%rdi), %xmm10
+; AVX2-NEXT: vmovaps 32(%rdi), %xmm12
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm12[1],xmm11[1]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm9[0,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = xmm8[0,0]
-; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm6[1],xmm4[1]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm14[0,1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm3[2,3],ymm14[4,5,6,7]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm8
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm8[2,3]
-; AVX2-NEXT: vmovaps 32(%rdi), %ymm8
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm2[1],xmm1[1]
+; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm15
+; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7]
+; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm15
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovddup {{.*#+}} xmm14 = xmm14[0,0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm10[1],xmm8[1]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[0,1],ymm15[0,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm0
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3],ymm0[2,3]
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm15
; AVX2-NEXT: vmovaps 32(%rsi), %ymm0
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm8[1],ymm0[1],ymm8[3],ymm0[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm6[2,3]
; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm13
-; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3]
; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm7
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
; AVX2-NEXT: vmovaps (%rdx), %ymm7
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm13
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
-; AVX2-NEXT: vmovaps (%rdi), %ymm13
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm15
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3]
+; AVX2-NEXT: vmovaps (%rdi), %ymm15
; AVX2-NEXT: vmovaps (%rsi), %ymm0
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm0[1],ymm13[3],ymm0[3]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
-; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm11
-; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm9
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3]
; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm15[0]
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm6 = xmm6[0],mem[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm3[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm10[0],xmm8[0]
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm8 = xmm8[0],mem[0]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-NEXT: vmovaps %xmm6, 16(%rax)
-; AVX2-NEXT: vmovaps %xmm4, (%rax)
-; AVX2-NEXT: vmovaps %xmm2, 208(%rax)
-; AVX2-NEXT: vmovaps %xmm1, 192(%rax)
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],mem[6,7]
+; AVX2-NEXT: vmovaps %xmm8, 16(%rax)
+; AVX2-NEXT: vmovaps %xmm5, (%rax)
+; AVX2-NEXT: vmovaps %xmm3, 208(%rax)
+; AVX2-NEXT: vmovaps %xmm7, 192(%rax)
; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
-; AVX2-NEXT: vmovaps %ymm9, 128(%rax)
-; AVX2-NEXT: vmovaps %ymm7, 160(%rax)
-; AVX2-NEXT: vmovaps %ymm3, 288(%rax)
-; AVX2-NEXT: vmovaps %ymm10, 320(%rax)
-; AVX2-NEXT: vmovaps %ymm5, 352(%rax)
+; AVX2-NEXT: vmovaps %ymm4, 128(%rax)
+; AVX2-NEXT: vmovaps %ymm9, 160(%rax)
+; AVX2-NEXT: vmovaps %ymm1, 288(%rax)
+; AVX2-NEXT: vmovaps %ymm6, 320(%rax)
+; AVX2-NEXT: vmovaps %ymm2, 352(%rax)
; AVX2-NEXT: vmovaps %ymm14, 32(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
@@ -607,23 +607,23 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512-LABEL: store_i64_stride6_vf8:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm4
; AVX512-NEXT: vmovdqu64 (%rsi), %zmm6
; AVX512-NEXT: vmovdqu64 (%rdx), %zmm2
; AVX512-NEXT: vmovdqu64 (%rcx), %zmm3
-; AVX512-NEXT: vmovdqu64 (%r8), %zmm10
+; AVX512-NEXT: vmovdqu64 (%r8), %zmm1
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512-NEXT: movb $12, %al
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: movb $12, %r10b
+; AVX512-NEXT: kmovd %r10d, %k1
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1}
-; AVX512-NEXT: movb $16, %al
-; AVX512-NEXT: kmovd %eax, %k2
-; AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2}
+; AVX512-NEXT: movb $16, %r10b
+; AVX512-NEXT: kmovd %r10d, %k2
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
; AVX512-NEXT: vmovdqu64 (%r9), %zmm5
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
@@ -631,11 +631,11 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512-NEXT: movb $48, %al
-; AVX512-NEXT: kmovd %eax, %k2
+; AVX512-NEXT: movb $48, %r9b
+; AVX512-NEXT: kmovd %r9d, %k2
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2}
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7>
-; AVX512-NEXT: vpermi2q %zmm10, %zmm8, %zmm7
+; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm7
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm8
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14]
@@ -646,47 +646,47 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2}
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7>
-; AVX512-NEXT: vpermi2q %zmm10, %zmm9, %zmm7
+; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm7
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7]
; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm9
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9]
; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm7
-; AVX512-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k1}
-; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm1
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,8,6,7]
-; AVX512-NEXT: vpermi2q %zmm5, %zmm1, %zmm7
-; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,7,15,7,15,7,15]
-; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vmovdqa (%rdx), %xmm10
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1}
+; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7]
+; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm10
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15]
+; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,7,15>
; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm11
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm11[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u>
-; AVX512-NEXT: vpermi2q %zmm10, %zmm1, %zmm4
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,14,2,3,4,5,6,15]
-; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11]
-; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u>
+; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm6
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15]
+; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm4
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11]
+; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm6
; AVX512-NEXT: vmovdqa (%rdi), %ymm2
; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u>
-; AVX512-NEXT: vpermi2q %zmm10, %zmm2, %zmm3
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,10,2,3,4,5,6,11]
-; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm2
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,12,6,7]
-; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm3
-; AVX512-NEXT: vmovdqu64 %zmm3, 192(%r10)
-; AVX512-NEXT: vmovdqu64 %zmm2, 128(%r10)
-; AVX512-NEXT: vmovdqu64 %zmm1, 320(%r10)
-; AVX512-NEXT: vmovdqu64 %zmm9, 256(%r10)
-; AVX512-NEXT: vmovdqu64 %zmm8, 64(%r10)
-; AVX512-NEXT: vmovdqu64 %zmm7, (%r10)
+; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11]
+; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm1
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7]
+; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm2
+; AVX512-NEXT: vmovdqu64 %zmm2, 192(%rax)
+; AVX512-NEXT: vmovdqu64 %zmm1, 128(%rax)
+; AVX512-NEXT: vmovdqu64 %zmm4, 320(%rax)
+; AVX512-NEXT: vmovdqu64 %zmm9, 256(%rax)
+; AVX512-NEXT: vmovdqu64 %zmm8, 64(%rax)
+; AVX512-NEXT: vmovdqu64 %zmm10, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 32
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 398b8fbe1d330..e8e476b1a5614 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -348,161 +348,161 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride3_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm9
-; SSE-NEXT: movdqa 16(%rdi), %xmm11
-; SSE-NEXT: movdqa (%rsi), %xmm13
-; SSE-NEXT: movdqa 16(%rsi), %xmm7
-; SSE-NEXT: movdqa (%rdx), %xmm8
-; SSE-NEXT: movdqa 16(%rdx), %xmm10
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
+; SSE-NEXT: movdqa (%rdi), %xmm2
+; SSE-NEXT: movdqa 16(%rdi), %xmm8
+; SSE-NEXT: movdqa (%rsi), %xmm4
+; SSE-NEXT: movdqa 16(%rsi), %xmm10
+; SSE-NEXT: movdqa (%rdx), %xmm1
+; SSE-NEXT: movdqa 16(%rdx), %xmm7
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6]
+; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: por %xmm5, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm5, %xmm12
-; SSE-NEXT: pandn %xmm2, %xmm12
-; SSE-NEXT: por %xmm1, %xmm12
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pandn %xmm9, %xmm3
+; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5]
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pandn %xmm6, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,4,5,5,6]
+; SSE-NEXT: pand %xmm0, %xmm11
+; SSE-NEXT: por %xmm9, %xmm11
+; SSE-NEXT: pand %xmm5, %xmm11
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm6
-; SSE-NEXT: por %xmm1, %xmm6
-; SSE-NEXT: movdqa %xmm7, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm2, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,5,7,7]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm1
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,2]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,0,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6]
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,6]
+; SSE-NEXT: pandn %xmm9, %xmm6
+; SSE-NEXT: por %xmm11, %xmm6
+; SSE-NEXT: movdqa %xmm10, %xmm9
+; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,2,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm11
+; SSE-NEXT: pandn %xmm9, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,7,7,7,7]
+; SSE-NEXT: pand %xmm0, %xmm12
+; SSE-NEXT: por %xmm11, %xmm12
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; SSE-NEXT: pand %xmm9, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7]
+; SSE-NEXT: movdqa %xmm9, %xmm11
+; SSE-NEXT: pandn %xmm13, %xmm11
+; SSE-NEXT: por %xmm12, %xmm11
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,2,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm12
+; SSE-NEXT: pandn %xmm10, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,6,6]
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: por %xmm12, %xmm8
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,6,6]
; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: pandn %xmm4, %xmm7
-; SSE-NEXT: por %xmm3, %xmm7
-; SSE-NEXT: movdqa %xmm13, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7]
-; SSE-NEXT: pandn %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,2]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,6]
-; SSE-NEXT: pand %xmm5, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: pandn %xmm10, %xmm7
+; SSE-NEXT: por %xmm8, %xmm7
+; SSE-NEXT: movdqa %xmm4, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,2,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm10
+; SSE-NEXT: pandn %xmm8, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7]
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: por %xmm10, %xmm8
+; SSE-NEXT: pand %xmm9, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,1,2,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,7,7]
+; SSE-NEXT: pandn %xmm10, %xmm9
+; SSE-NEXT: por %xmm8, %xmm9
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,6]
+; SSE-NEXT: pand %xmm5, %xmm2
+; SSE-NEXT: pandn %xmm4, %xmm5
+; SSE-NEXT: por %xmm2, %xmm5
; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,1,0,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6]
-; SSE-NEXT: pandn %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,6]
+; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: movdqa %xmm0, (%rcx)
-; SSE-NEXT: movdqa %xmm2, 32(%rcx)
+; SSE-NEXT: movdqa %xmm9, 32(%rcx)
; SSE-NEXT: movdqa %xmm7, 48(%rcx)
-; SSE-NEXT: movdqa %xmm1, 80(%rcx)
+; SSE-NEXT: movdqa %xmm11, 80(%rcx)
; SSE-NEXT: movdqa %xmm6, 16(%rcx)
-; SSE-NEXT: movdqa %xmm12, 64(%rcx)
+; SSE-NEXT: movdqa %xmm3, 64(%rcx)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i8_stride3_vf32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX1-NEXT: vmovdqa (%rsi), %xmm8
+; AVX1-NEXT: vmovdqa (%rsi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vmovdqa (%rdx), %xmm6
; AVX1-NEXT: vmovdqa 16(%rdx), %xmm7
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, 64(%rcx)
-; AVX1-NEXT: vmovdqa %xmm1, 80(%rcx)
-; AVX1-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, 64(%rcx)
+; AVX1-NEXT: vmovdqa %xmm4, 80(%rcx)
+; AVX1-NEXT: vmovdqa %xmm5, 32(%rcx)
; AVX1-NEXT: vmovdqa %xmm3, 48(%rcx)
-; AVX1-NEXT: vmovdqa %xmm6, (%rcx)
-; AVX1-NEXT: vmovdqa %xmm5, 16(%rcx)
+; AVX1-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_i8_stride3_vf32:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll
index 15a8b33693c19..ec65d22bfe861 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll
@@ -278,43 +278,43 @@ define void @store_i8_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm5
-; SSE-NEXT: movdqa 16(%rsi), %xmm8
-; SSE-NEXT: movdqa (%rdx), %xmm3
+; SSE-NEXT: movdqa 16(%rsi), %xmm6
+; SSE-NEXT: movdqa (%rdx), %xmm7
; SSE-NEXT: movdqa 16(%rdx), %xmm4
-; SSE-NEXT: movdqa (%rcx), %xmm6
+; SSE-NEXT: movdqa (%rcx), %xmm8
; SSE-NEXT: movdqa 16(%rcx), %xmm9
-; SSE-NEXT: movdqa %xmm3, %xmm7
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE-NEXT: movdqa %xmm7, %xmm10
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE-NEXT: movdqa %xmm2, %xmm10
-; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm6, %xmm7
-; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE-NEXT: movdqa %xmm4, %xmm7
+; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE-NEXT: movdqa %xmm1, 96(%r8)
-; SSE-NEXT: movdqa %xmm3, 112(%r8)
-; SSE-NEXT: movdqa %xmm6, 64(%r8)
-; SSE-NEXT: movdqa %xmm7, 80(%r8)
+; SSE-NEXT: movdqa %xmm6, 112(%r8)
+; SSE-NEXT: movdqa %xmm8, 64(%r8)
+; SSE-NEXT: movdqa %xmm10, 80(%r8)
; SSE-NEXT: movdqa %xmm0, 32(%r8)
; SSE-NEXT: movdqa %xmm5, 48(%r8)
; SSE-NEXT: movdqa %xmm2, (%r8)
-; SSE-NEXT: movdqa %xmm10, 16(%r8)
+; SSE-NEXT: movdqa %xmm3, 16(%r8)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i8_stride4_vf32:
@@ -325,32 +325,32 @@ define void @store_i8_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX1-NEXT: vmovdqa (%rcx), %xmm2
; AVX1-NEXT: vmovdqa 16(%rcx), %xmm3
; AVX1-NEXT: vmovdqa (%rdx), %xmm6
; AVX1-NEXT: vmovdqa 16(%rdx), %xmm7
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vmovaps %ymm0, (%r8)
-; AVX1-NEXT: vmovaps %ymm3, 96(%r8)
-; AVX1-NEXT: vmovaps %ymm2, 32(%r8)
-; AVX1-NEXT: vmovaps %ymm1, 64(%r8)
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
+; AVX1-NEXT: vmovaps %ymm2, (%r8)
+; AVX1-NEXT: vmovaps %ymm1, 96(%r8)
+; AVX1-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX1-NEXT: vmovaps %ymm3, 64(%r8)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index bb0f1a9e94758..a7a172cd7e071 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -249,17 +249,17 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-LABEL: store_i8_stride6_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq {{.*#+}} xmm9 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
-; SSE-NEXT: movq {{.*#+}} xmm10 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; SSE-NEXT: movq {{.*#+}} xmm8 = mem[0],zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSE-NEXT: movdqa %xmm3, %xmm5
@@ -269,43 +269,43 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE-NEXT: movdqa %xmm6, %xmm7
; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1]
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[1,0,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: pandn %xmm9, %xmm10
+; SSE-NEXT: por %xmm5, %xmm10
+; SSE-NEXT: pand %xmm6, %xmm10
+; SSE-NEXT: por %xmm7, %xmm10
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,3,3,3]
-; SSE-NEXT: movdqa %xmm6, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,3]
-; SSE-NEXT: pand %xmm6, %xmm3
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: pandn %xmm4, %xmm5
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,5,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
+; SSE-NEXT: pand %xmm6, %xmm4
+; SSE-NEXT: por %xmm5, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: pandn %xmm3, %xmm8
+; SSE-NEXT: por %xmm4, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
; SSE-NEXT: pand %xmm6, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; SSE-NEXT: pandn %xmm1, %xmm6
; SSE-NEXT: por %xmm0, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,2,2]
-; SSE-NEXT: pandn %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2]
+; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: por %xmm6, %xmm0
-; SSE-NEXT: movdqa %xmm1, 32(%rax)
+; SSE-NEXT: movdqa %xmm8, 32(%rax)
; SSE-NEXT: movdqa %xmm0, 16(%rax)
-; SSE-NEXT: movdqa %xmm2, (%rax)
+; SSE-NEXT: movdqa %xmm10, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i8_stride6_vf8:
@@ -429,142 +429,141 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride6_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm9
-; SSE-NEXT: movdqa (%rsi), %xmm5
-; SSE-NEXT: movdqa (%rdx), %xmm11
-; SSE-NEXT: movdqa (%rcx), %xmm14
-; SSE-NEXT: movdqa (%r8), %xmm15
-; SSE-NEXT: movdqa (%r9), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE-NEXT: movdqa (%rdi), %xmm10
+; SSE-NEXT: movdqa (%rsi), %xmm15
+; SSE-NEXT: movdqa (%rdx), %xmm9
+; SSE-NEXT: movdqa (%rcx), %xmm13
+; SSE-NEXT: movdqa (%r8), %xmm7
+; SSE-NEXT: movdqa (%r9), %xmm14
+; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
-; SSE-NEXT: movdqa %xmm1, %xmm10
+; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: pand %xmm7, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm9, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[1,0,2,2,4,5,6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm6
+; SSE-NEXT: movdqa %xmm2, %xmm8
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE-NEXT: movdqa %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm15, %xmm12
-; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: por %xmm2, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm7, %xmm5
+; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0]
+; SSE-NEXT: movdqa %xmm0, %xmm11
+; SSE-NEXT: pandn %xmm1, %xmm11
+; SSE-NEXT: por %xmm2, %xmm11
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; SSE-NEXT: pand %xmm6, %xmm11
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE-NEXT: movdqa %xmm2, %xmm12
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm6, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: por %xmm11, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3]
+; SSE-NEXT: movdqa %xmm0, %xmm11
+; SSE-NEXT: pandn %xmm1, %xmm11
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,6,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,2,3,3]
-; SSE-NEXT: movdqa %xmm7, %xmm6
-; SSE-NEXT: pandn %xmm3, %xmm6
-; SSE-NEXT: pand %xmm7, %xmm1
-; SSE-NEXT: por %xmm1, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3]
-; SSE-NEXT: movdqa %xmm13, %xmm10
-; SSE-NEXT: pandn %xmm3, %xmm10
-; SSE-NEXT: pand %xmm13, %xmm6
-; SSE-NEXT: por %xmm6, %xmm10
-; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm3, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm3, %xmm6
+; SSE-NEXT: por %xmm11, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,3,3]
+; SSE-NEXT: movdqa %xmm4, %xmm8
+; SSE-NEXT: pandn %xmm11, %xmm8
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: por %xmm1, %xmm8
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
+; SSE-NEXT: movdqa %xmm11, %xmm12
+; SSE-NEXT: pandn %xmm1, %xmm12
+; SSE-NEXT: pand %xmm11, %xmm8
+; SSE-NEXT: por %xmm8, %xmm12
+; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pandn %xmm1, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: por %xmm1, %xmm8
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,2,2]
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: pand %xmm1, %xmm6
-; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm6, %xmm2
-; SSE-NEXT: pand %xmm14, %xmm5
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,3,3,3]
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm5, %xmm6
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,5,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,3]
-; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,2,3,3]
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: pandn %xmm6, %xmm4
-; SSE-NEXT: pand %xmm7, %xmm5
-; SSE-NEXT: por %xmm5, %xmm4
-; SSE-NEXT: pand %xmm13, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
-; SSE-NEXT: pandn %xmm5, %xmm13
-; SSE-NEXT: por %xmm4, %xmm13
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,1,1]
-; SSE-NEXT: pand %xmm7, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
-; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: por %xmm4, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,0,0]
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: pand %xmm0, %xmm7
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE-NEXT: pandn %xmm3, %xmm8
-; SSE-NEXT: por %xmm5, %xmm8
-; SSE-NEXT: pshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[1,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; SSE-NEXT: pandn %xmm4, %xmm0
-; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,2,2]
+; SSE-NEXT: movdqa %xmm1, %xmm13
+; SSE-NEXT: pandn %xmm15, %xmm13
+; SSE-NEXT: pand %xmm1, %xmm8
+; SSE-NEXT: por %xmm8, %xmm13
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,2,2]
+; SSE-NEXT: movdqa %xmm15, %xmm14
+; SSE-NEXT: pandn %xmm2, %xmm14
+; SSE-NEXT: pand %xmm15, %xmm13
+; SSE-NEXT: por %xmm13, %xmm14
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3]
+; SSE-NEXT: movdqa %xmm0, %xmm13
+; SSE-NEXT: pandn %xmm2, %xmm13
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: por %xmm13, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,3,3]
+; SSE-NEXT: movdqa %xmm4, %xmm3
+; SSE-NEXT: pandn %xmm13, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: pand %xmm11, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3]
+; SSE-NEXT: pandn %xmm2, %xmm11
+; SSE-NEXT: por %xmm3, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1]
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[1,0,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; SSE-NEXT: pandn %xmm3, %xmm4
+; SSE-NEXT: por %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,0,0]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: pand %xmm6, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,0,0,0]
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: por %xmm3, %xmm6
+; SSE-NEXT: pshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[1,1,2,2]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = mem[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
+; SSE-NEXT: pandn %xmm3, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,2,2]
-; SSE-NEXT: pandn %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,2,2]
+; SSE-NEXT: pandn %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pand %xmm14, %xmm1
+; SSE-NEXT: pand %xmm15, %xmm1
; SSE-NEXT: pshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[1,1,2,2]
-; SSE-NEXT: pandn %xmm0, %xmm14
-; SSE-NEXT: por %xmm1, %xmm14
+; SSE-NEXT: pandn %xmm0, %xmm15
+; SSE-NEXT: por %xmm1, %xmm15
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa %xmm14, 16(%rax)
-; SSE-NEXT: movdqa %xmm8, 48(%rax)
-; SSE-NEXT: movdqa %xmm13, 80(%rax)
-; SSE-NEXT: movdqa %xmm2, 64(%rax)
-; SSE-NEXT: movdqa %xmm10, 32(%rax)
+; SSE-NEXT: movdqa %xmm15, 16(%rax)
+; SSE-NEXT: movdqa %xmm6, 48(%rax)
+; SSE-NEXT: movdqa %xmm11, 80(%rax)
+; SSE-NEXT: movdqa %xmm14, 64(%rax)
+; SSE-NEXT: movdqa %xmm12, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: retq
@@ -572,60 +571,60 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-LABEL: store_i8_stride6_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: vmovdqa (%rdi), %xmm10
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vmovdqa (%rsi), %xmm2
; AVX1-NEXT: vmovdqa (%rdx), %xmm3
; AVX1-NEXT: vmovdqa (%rcx), %xmm4
-; AVX1-NEXT: vmovdqa (%r8), %xmm8
-; AVX1-NEXT: vmovdqa (%r9), %xmm9
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; AVX1-NEXT: vmovdqa (%r8), %xmm5
+; AVX1-NEXT: vmovdqa (%r9), %xmm6
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3,4],xmm6[5],xmm0[6,7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[1,0,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5],xmm10[6],xmm0[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[1,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6],xmm11[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,3,3,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6],xmm5[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6],xmm5[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,3,3,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,6,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-NEXT: vmovaps %ymm1, 64(%rax)
-; AVX1-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX1-NEXT: vmovaps %ymm11, (%rax)
+; AVX1-NEXT: vmovaps %ymm2, 32(%rax)
+; AVX1-NEXT: vmovaps %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -738,271 +737,268 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride6_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: movdqa 16(%rdi), %xmm11
-; SSE-NEXT: movdqa 16(%rsi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdx), %xmm8
-; SSE-NEXT: movdqa 16(%rcx), %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%r8), %xmm13
-; SSE-NEXT: movdqa 16(%r9), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm10
-; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
+; SSE-NEXT: subq $40, %rsp
+; SSE-NEXT: movdqa 16(%rdi), %xmm10
+; SSE-NEXT: movdqa 16(%rsi), %xmm9
+; SSE-NEXT: movdqa 16(%rdx), %xmm13
+; SSE-NEXT: movdqa 16(%rcx), %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%r8), %xmm2
+; SSE-NEXT: movdqa 16(%r9), %xmm11
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm10, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,2,3]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: movdqa %xmm13, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,5,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,2,3]
; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: movdqa %xmm13, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3]
-; SSE-NEXT: movdqa %xmm2, %xmm7
-; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: por %xmm6, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; SSE-NEXT: pand %xmm15, %xmm7
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3]
-; SSE-NEXT: movdqa %xmm15, %xmm3
-; SSE-NEXT: pandn %xmm6, %xmm3
-; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: por %xmm5, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3]
+; SSE-NEXT: movdqa %xmm1, %xmm15
+; SSE-NEXT: pandn %xmm5, %xmm15
+; SSE-NEXT: por %xmm6, %xmm15
+; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; SSE-NEXT: pand %xmm12, %xmm15
+; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,2,3,3]
+; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: pandn %xmm5, %xmm3
+; SSE-NEXT: por %xmm15, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: pandn %xmm6, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm6, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,2,2]
-; SSE-NEXT: movdqa %xmm12, %xmm6
-; SSE-NEXT: pandn %xmm9, %xmm6
-; SSE-NEXT: pand %xmm12, %xmm7
-; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; SSE-NEXT: movdqa %xmm14, %xmm3
-; SSE-NEXT: pandn %xmm7, %xmm3
-; SSE-NEXT: pand %xmm14, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm0, %xmm15
+; SSE-NEXT: pandn %xmm5, %xmm15
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,2,2]
+; SSE-NEXT: pand %xmm0, %xmm5
+; SSE-NEXT: por %xmm5, %xmm15
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,0,65535]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,2]
+; SSE-NEXT: movdqa %xmm11, %xmm6
+; SSE-NEXT: pandn %xmm5, %xmm6
+; SSE-NEXT: pand %xmm11, %xmm15
+; SSE-NEXT: por %xmm15, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[1,1,2,2]
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pandn %xmm15, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm6
; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,0,1,1]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: por %xmm4, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm4
-; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm6, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0]
-; SSE-NEXT: movdqa %xmm10, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pand %xmm10, %xmm4
-; SSE-NEXT: por %xmm4, %xmm3
; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
-; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; SSE-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm4
-; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por %xmm4, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3]
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: por %xmm1, %xmm5
-; SSE-NEXT: pand %xmm15, %xmm5
-; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3]
-; SSE-NEXT: movdqa %xmm15, %xmm3
-; SSE-NEXT: pandn %xmm4, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: por %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,2,2]
-; SSE-NEXT: movdqa %xmm12, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[1,0,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: pandn %xmm6, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: por %xmm4, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
+; SSE-NEXT: movdqa %xmm0, %xmm6
; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: pand %xmm12, %xmm5
-; SSE-NEXT: por %xmm5, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
-; SSE-NEXT: movdqa %xmm14, %xmm3
+; SSE-NEXT: pand %xmm0, %xmm7
+; SSE-NEXT: por %xmm7, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
+; SSE-NEXT: movdqa %xmm7, %xmm3
; SSE-NEXT: pandn %xmm4, %xmm3
-; SSE-NEXT: pand %xmm14, %xmm6
+; SSE-NEXT: pand %xmm7, %xmm6
; SSE-NEXT: por %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[1,0,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: movdqa (%rdi), %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: por %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,0,0]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,3,3,3]
; SSE-NEXT: movdqa %xmm0, %xmm6
; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: movdqa (%rsi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: por %xmm5, %xmm6
-; SSE-NEXT: pand %xmm10, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: movdqa %xmm10, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm4
-; SSE-NEXT: movdqa (%rdx), %xmm13
-; SSE-NEXT: movdqa (%rcx), %xmm3
+; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: por %xmm6, %xmm4
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3]
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pandn %xmm6, %xmm8
+; SSE-NEXT: por %xmm4, %xmm8
+; SSE-NEXT: pand %xmm12, %xmm8
+; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,3,3]
+; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: pandn %xmm6, %xmm3
+; SSE-NEXT: por %xmm8, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm13, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,5,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pandn %xmm6, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2]
; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: por %xmm4, %xmm6
-; SSE-NEXT: movdqa (%r8), %xmm9
-; SSE-NEXT: movdqa %xmm9, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[2,2,3,3]
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pandn %xmm11, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: por %xmm6, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2]
+; SSE-NEXT: movdqa %xmm11, %xmm9
+; SSE-NEXT: pandn %xmm6, %xmm9
+; SSE-NEXT: pand %xmm11, %xmm8
+; SSE-NEXT: por %xmm8, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pandn %xmm6, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm9
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[1,0,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; SSE-NEXT: movdqa %xmm1, %xmm8
+; SSE-NEXT: pandn %xmm6, %xmm8
+; SSE-NEXT: movdqa (%rdi), %xmm13
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,1,1]
+; SSE-NEXT: pand %xmm1, %xmm6
+; SSE-NEXT: por %xmm6, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: movdqa (%rsi), %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: por %xmm8, %xmm6
+; SSE-NEXT: pand %xmm7, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0]
+; SSE-NEXT: movdqa %xmm7, %xmm3
+; SSE-NEXT: pandn %xmm2, %xmm3
; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: movdqa (%r9), %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[2,2,3,3]
-; SSE-NEXT: movdqa %xmm15, %xmm5
-; SSE-NEXT: pandn %xmm11, %xmm5
-; SSE-NEXT: pand %xmm15, %xmm3
-; SSE-NEXT: por %xmm3, %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pandn %xmm11, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,2,2]
-; SSE-NEXT: pand %xmm0, %xmm5
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,2,2]
-; SSE-NEXT: movdqa %xmm12, %xmm8
-; SSE-NEXT: pandn %xmm5, %xmm8
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm13, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,3,3,3]
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: movdqa (%rdx), %xmm4
+; SSE-NEXT: movdqa (%rcx), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm4, %xmm14
+; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,5,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,3]
+; SSE-NEXT: pand %xmm0, %xmm9
+; SSE-NEXT: por %xmm6, %xmm9
+; SSE-NEXT: movdqa (%r8), %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm15
+; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,2,3,3]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pandn %xmm6, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm9
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa (%r9), %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,3,3]
+; SSE-NEXT: movdqa %xmm12, %xmm10
+; SSE-NEXT: pandn %xmm9, %xmm10
; SSE-NEXT: pand %xmm12, %xmm3
-; SSE-NEXT: por %xmm3, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
-; SSE-NEXT: movdqa %xmm14, %xmm11
-; SSE-NEXT: pandn %xmm3, %xmm11
-; SSE-NEXT: pand %xmm14, %xmm8
-; SSE-NEXT: por %xmm8, %xmm11
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm4
+; SSE-NEXT: por %xmm3, %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pandn %xmm3, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,2,2]
; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
-; SSE-NEXT: movdqa %xmm10, %xmm7
-; SSE-NEXT: pandn %xmm1, %xmm7
-; SSE-NEXT: pand %xmm10, %xmm4
-; SSE-NEXT: por %xmm4, %xmm7
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: por %xmm3, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,2,2]
+; SSE-NEXT: movdqa %xmm11, %xmm10
+; SSE-NEXT: pandn %xmm3, %xmm10
+; SSE-NEXT: pand %xmm11, %xmm9
+; SSE-NEXT: por %xmm9, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pandn %xmm3, %xmm9
+; SSE-NEXT: pand %xmm5, %xmm10
+; SSE-NEXT: por %xmm10, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[1,0,2,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; SSE-NEXT: movdqa %xmm1, %xmm10
+; SSE-NEXT: pandn %xmm3, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1]
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: por %xmm3, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,0,0]
+; SSE-NEXT: movdqa %xmm0, %xmm14
+; SSE-NEXT: pandn %xmm3, %xmm14
+; SSE-NEXT: pand %xmm0, %xmm10
+; SSE-NEXT: por %xmm10, %xmm14
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
+; SSE-NEXT: movdqa %xmm7, %xmm8
+; SSE-NEXT: pandn %xmm3, %xmm8
+; SSE-NEXT: pand %xmm7, %xmm14
+; SSE-NEXT: por %xmm14, %xmm8
; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,3,3]
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: por %xmm1, %xmm4
-; SSE-NEXT: pand %xmm15, %xmm4
-; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
-; SSE-NEXT: pandn %xmm3, %xmm15
-; SSE-NEXT: por %xmm4, %xmm15
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pandn %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,3,3,3]
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,3]
; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm3, %xmm4
-; SSE-NEXT: pand %xmm12, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,2,2]
+; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3]
+; SSE-NEXT: movdqa %xmm1, %xmm10
+; SSE-NEXT: pandn %xmm6, %xmm10
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: por %xmm3, %xmm10
+; SSE-NEXT: pand %xmm12, %xmm10
+; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; SSE-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,3,3]
; SSE-NEXT: pandn %xmm3, %xmm12
-; SSE-NEXT: por %xmm4, %xmm12
-; SSE-NEXT: pand %xmm14, %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
-; SSE-NEXT: pandn %xmm3, %xmm14
-; SSE-NEXT: por %xmm12, %xmm14
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,1,1]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[1,0,2,2,4,5,6,7]
+; SSE-NEXT: por %xmm10, %xmm12
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
+; SSE-NEXT: movdqa %xmm0, %xmm10
+; SSE-NEXT: pandn %xmm3, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,2,2]
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: por %xmm3, %xmm10
+; SSE-NEXT: pand %xmm11, %xmm10
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,2,2]
+; SSE-NEXT: pandn %xmm3, %xmm11
+; SSE-NEXT: por %xmm10, %xmm11
+; SSE-NEXT: pand %xmm5, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm11, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,1,1]
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; SSE-NEXT: pandn %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,0,0]
-; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: pand %xmm10, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: pandn %xmm1, %xmm10
-; SSE-NEXT: por %xmm0, %xmm10
+; SSE-NEXT: pandn %xmm4, %xmm1
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
+; SSE-NEXT: pandn %xmm1, %xmm7
+; SSE-NEXT: por %xmm0, %xmm7
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa %xmm10, (%rax)
-; SSE-NEXT: movdqa %xmm14, 16(%rax)
-; SSE-NEXT: movdqa %xmm15, 32(%rax)
-; SSE-NEXT: movdqa %xmm7, 48(%rax)
-; SSE-NEXT: movdqa %xmm11, 64(%rax)
+; SSE-NEXT: movdqa %xmm7, (%rax)
+; SSE-NEXT: movdqa %xmm5, 16(%rax)
+; SSE-NEXT: movdqa %xmm12, 32(%rax)
+; SSE-NEXT: movdqa %xmm8, 48(%rax)
+; SSE-NEXT: movdqa %xmm9, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1011,184 +1007,183 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movaps %xmm0, 112(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%rax)
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 144(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 144(%rax)
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%rax)
-; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX1-LABEL: store_i8_stride6_vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rax
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm9, %ymm2
; AVX1-NEXT: vmovdqa 16(%rcx), %xmm1
; AVX1-NEXT: vmovdqa 16(%rdx), %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,5,6,7,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,6,7,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vmovdqa 16(%r8), %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10],zero,xmm4[12,13,14,15]
-; AVX1-NEXT: vmovdqa 16(%r9), %xmm5
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero
-; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa %xmm4, %xmm9
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7]
+; AVX1-NEXT: vandps %ymm3, %ymm9, %ymm3
+; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vmovdqa 16(%r8), %xmm12
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15]
+; AVX1-NEXT: vmovdqa 16(%r9), %xmm14
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm14[8],zero,zero,zero,zero,zero,xmm14[9],zero,zero,zero,zero
; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm3, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[5],zero,zero,zero,zero,zero,xmm14[6],zero,zero,zero,zero,zero,xmm14[7]
+; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vandps %ymm7, %ymm0, %ymm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm0, %ymm9, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vandnps %ymm1, %ymm9, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX1-NEXT: vmovdqa %xmm9, %xmm4
-; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15]
-; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u>
+; AVX1-NEXT: vpshufb %xmm15, %xmm12, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm14[13],zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,xmm14[15]
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6],zero,xmm0[8,9,10,11,12],zero,xmm0[14,15]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm5[10],zero,zero,zero,zero,zero,xmm5[11],zero,zero,zero,zero,zero,xmm5[12],zero,zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128]
+; AVX1-NEXT: vpshufb %xmm8, %xmm14, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa (%rsi), %xmm10
-; AVX1-NEXT: vmovdqa (%rdi), %xmm9
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,2,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
+; AVX1-NEXT: vmovdqa (%rsi), %xmm3
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa (%rcx), %xmm15
-; AVX1-NEXT: vmovdqa (%rdx), %xmm13
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vmovdqa (%rcx), %xmm7
+; AVX1-NEXT: vmovdqa (%rdx), %xmm6
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,5,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[2,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm7, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm6
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,5,6,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm0, %ymm9, %ymm0
+; AVX1-NEXT: vandnps %ymm1, %ymm9, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm10
; AVX1-NEXT: vmovdqa (%r8), %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm1[13,u],zero,zero,zero,zero,xmm1[14,u],zero,zero,zero,zero,xmm1[15,u]
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7]
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm7
+; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm15
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero
; AVX1-NEXT: vmovdqa (%r9), %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15]
-; AVX1-NEXT: vpor %xmm4, %xmm7, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
-; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero
-; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm3[1,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[1,0,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX1-NEXT: vandps %ymm7, %ymm4, %ymm4
-; AVX1-NEXT: vandnps %ymm6, %ymm7, %ymm6
-; AVX1-NEXT: vorps %ymm6, %ymm4, %ymm4
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5],xmm3[6],xmm6[7]
-; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[2],zero,zero,zero,zero,zero,xmm5[3],zero,zero,zero,zero,zero,xmm5[4],zero,zero
-; AVX1-NEXT: vpor %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,u],zero,zero,zero,zero,xmm2[1,u],zero,zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[1,0,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX1-NEXT: vandps %ymm7, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4
-; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15]
+; AVX1-NEXT: vpor %xmm9, %xmm15, %xmm9
+; AVX1-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
+; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm8
+; AVX1-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[1,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9
+; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX1-NEXT: vandps %ymm13, %ymm8, %ymm8
+; AVX1-NEXT: vandnps %ymm9, %ymm13, %ymm9
+; AVX1-NEXT: vorps %ymm9, %ymm8, %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm9
+; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[2,u],zero,zero,zero,zero,xmm12[3,u],zero,zero,zero,zero,xmm12[4,u],zero,zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1,2],xmm11[3],xmm9[4,5],xmm11[6],xmm9[7]
+; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
+; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm15
+; AVX1-NEXT: vpor %xmm15, %xmm9, %xmm9
+; AVX1-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128>
+; AVX1-NEXT: vpshufb %xmm15, %xmm12, %xmm12
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm10, %xmm8, %xmm8
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm9, %xmm14, %xmm12
+; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm12
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
+; AVX1-NEXT: vandps %ymm2, %ymm13, %ymm2
+; AVX1-NEXT: vandnps %ymm7, %ymm13, %ymm7
+; AVX1-NEXT: vorps %ymm7, %ymm2, %ymm7
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5],xmm2[6],xmm6[7]
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[2],zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero
-; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7]
-; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm6
-; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,0,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[1,0,2,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5
-; AVX1-NEXT: vandps %ymm6, %ymm3, %ymm3
-; AVX1-NEXT: vorps %ymm5, %ymm3, %ymm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero
-; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm8
+; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
+; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm7
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm8
+; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,5,6,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX1-NEXT: vandnps %ymm3, %ymm5, %ymm3
+; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero
@@ -1196,11 +1191,10 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX1-NEXT: vmovdqa %xmm5, 48(%rax)
-; AVX1-NEXT: vmovdqa %xmm4, (%rax)
+; AVX1-NEXT: vmovdqa %xmm4, 48(%rax)
+; AVX1-NEXT: vmovdqa %xmm7, (%rax)
; AVX1-NEXT: vmovdqa %xmm2, 16(%rax)
-; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vmovaps %xmm0, 96(%rax)
+; AVX1-NEXT: vmovdqa %xmm12, 96(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 112(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1215,388 +1209,372 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-NEXT: vmovaps %xmm0, 128(%rax)
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vmovaps %xmm0, 144(%rax)
-; AVX1-NEXT: popq %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: store_i8_stride6_vf32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: subq $72, %rsp
-; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4
-; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm8
-; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm15
-; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm7
-; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1
-; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6
-; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm12
-; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0
-; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11
+; AVX2-SLOW-NEXT: subq $40, %rsp
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2
+; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4
+; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6
+; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3
+; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5
+; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm7
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1]
+; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
-; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm2
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm5
-; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm3
-; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm9
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm11
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9
+; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm10
+; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10
+; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm8
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm14
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm10
-; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2
-; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[4],ymm10[4],ymm2[5],ymm10[5],ymm2[6],ymm10[6],ymm2[7],ymm10[7],ymm2[16],ymm10[16],ymm2[17],ymm10[17],ymm2[18],ymm10[18],ymm2[19],ymm10[19],ymm2[20],ymm10[20],ymm2[21],ymm10[21],ymm2[22],ymm10[22],ymm2[23],ymm10[23]
+; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm12
+; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm10
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm15
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm15
; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm10
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm10
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm8
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11
+; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm15, %ymm13
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
+; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm3, %ymm2
-; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm3
+; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm2
-; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm13
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm3
+; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm13
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm14
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14
; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6
-; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm1
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm4, %ymm4
+; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm3
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm14, %ymm3
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm14, %ymm4
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14
-; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
-; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15]
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm14, %ymm14
+; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm5, %ymm1
-; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15],ymm6[24],ymm7[24],ymm6[25],ymm7[25],ymm6[26],ymm7[26],ymm6[27],ymm7[27],ymm6[28],ymm7[28],ymm6[29],ymm7[29],ymm6[30],ymm7[30],ymm6[31],ymm7[31]
-; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm7, %ymm5
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm7, %ymm5, %ymm5
+; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15],ymm6[24],ymm4[24],ymm6[25],ymm4[25],ymm6[26],ymm4[26],ymm6[27],ymm4[27],ymm6[28],ymm4[28],ymm6[29],ymm4[29],ymm6[30],ymm4[30],ymm6[31],ymm4[31]
+; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax)
-; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm14, 96(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm14, 96(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax)
-; AVX2-SLOW-NEXT: addq $72, %rsp
+; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rax)
+; AVX2-SLOW-NEXT: addq $40, %rsp
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: store_i8_stride6_vf32:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: subq $40, %rsp
-; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm15
-; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7
-; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm8
-; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4
-; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1
-; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm6
-; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12
-; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0
-; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3
+; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2
+; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4
+; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5
+; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1]
+; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
-; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm2
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5
-; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm3
-; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm11
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9
+; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10
+; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10
+; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm14
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm10
-; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2
-; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm1
-; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[4],ymm10[4],ymm2[5],ymm10[5],ymm2[6],ymm10[6],ymm2[7],ymm10[7],ymm2[16],ymm10[16],ymm2[17],ymm10[17],ymm2[18],ymm10[18],ymm2[19],ymm10[19],ymm2[20],ymm10[20],ymm2[21],ymm10[21],ymm2[22],ymm10[22],ymm2[23],ymm10[23]
+; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm12
+; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm10
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm15
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm10
+; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm15
+; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm10
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm3
-; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11
+; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm15, %ymm13
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
+; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm3, %ymm2
-; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm5
+; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm2
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
-; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm13
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm5
+; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
+; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm13
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm14
-; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6
-; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11],ymm7[12],ymm0[12],ymm7[13],ymm0[13],ymm7[14],ymm0[14],ymm7[15],ymm0[15],ymm7[24],ymm0[24],ymm7[25],ymm0[25],ymm7[26],ymm0[26],ymm7[27],ymm0[27],ymm7[28],ymm0[28],ymm7[29],ymm0[29],ymm7[30],ymm0[30],ymm7[31],ymm0[31]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14
+; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm5
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm5
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm14, %ymm4
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14
-; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
-; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm14
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm5, %ymm1
-; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
-; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm7, %ymm5
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax)
; AVX2-FAST-NEXT: vmovdqa %ymm14, 160(%rax)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax)
-; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax)
-; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax)
+; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax)
+; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX2-FAST-NEXT: addq $40, %rsp
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: store_i8_stride6_vf32:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa (%rdi), %ymm12
-; AVX512-NEXT: vmovdqa (%rsi), %ymm13
-; AVX512-NEXT: vmovdqa (%rdx), %ymm10
-; AVX512-NEXT: vmovdqa (%rcx), %ymm11
-; AVX512-NEXT: vmovdqa (%r8), %ymm8
-; AVX512-NEXT: vmovdqa (%r9), %ymm9
-; AVX512-NEXT: vmovdqa (%rsi), %xmm14
+; AVX512-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512-NEXT: vmovdqa (%rsi), %ymm5
+; AVX512-NEXT: vmovdqa (%rdx), %ymm2
+; AVX512-NEXT: vmovdqa (%rcx), %ymm3
+; AVX512-NEXT: vmovdqa (%r8), %ymm0
+; AVX512-NEXT: vmovdqa (%r9), %ymm1
+; AVX512-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm0
-; AVX512-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX512-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
+; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm8
+; AVX512-NEXT: vmovdqa (%rdi), %xmm9
+; AVX512-NEXT: vpshufb %xmm6, %xmm9, %xmm6
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
+; AVX512-NEXT: vmovdqa (%rcx), %xmm8
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm11
+; AVX512-NEXT: vmovdqa (%rdx), %xmm12
+; AVX512-NEXT: vpshufb %xmm10, %xmm12, %xmm10
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
; AVX512-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512-NEXT: kmovd %ecx, %k1
-; AVX512-NEXT: vmovdqu16 %ymm0, %ymm2 {%k1}
-; AVX512-NEXT: vmovdqa (%r9), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm5
-; AVX512-NEXT: vmovdqa (%r8), %xmm7
-; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX512-NEXT: vmovdqu16 %ymm6, %ymm10 {%k1}
+; AVX512-NEXT: vmovdqa (%r9), %xmm11
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm6, %xmm11, %xmm13
+; AVX512-NEXT: vmovdqa (%r8), %xmm14
+; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm6
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
; AVX512-NEXT: movw $9362, %cx # imm = 0x2492
; AVX512-NEXT: kmovd %ecx, %k2
-; AVX512-NEXT: vmovdqu16 %ymm0, %ymm2 {%k2}
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5]
-; AVX512-NEXT: vpermw %ymm0, %ymm5, %ymm0
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512-NEXT: vprold $16, %xmm5, %xmm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
-; AVX512-NEXT: vmovdqu16 %ymm5, %ymm0 {%k2}
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512-NEXT: vmovdqu16 %ymm6, %ymm10 {%k2}
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5]
+; AVX512-NEXT: vpermw %ymm6, %ymm13, %ymm6
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512-NEXT: vprold $16, %xmm13, %xmm13
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
+; AVX512-NEXT: vmovdqu16 %ymm13, %ymm6 {%k2}
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4]
-; AVX512-NEXT: vpermw %ymm5, %ymm15, %ymm0 {%k1}
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[16],ymm13[16],ymm12[17],ymm13[17],ymm12[18],ymm13[18],ymm12[19],ymm13[19],ymm12[20],ymm13[20],ymm12[21],ymm13[21],ymm12[22],ymm13[22],ymm12[23],ymm13[23]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13]
-; AVX512-NEXT: vpermw %ymm2, %ymm5, %ymm2
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23]
-; AVX512-NEXT: vprold $16, %ymm5, %ymm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT: vmovdqu16 %ymm5, %ymm2 {%k2}
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23]
+; AVX512-NEXT: vpermw %ymm13, %ymm15, %ymm6 {%k1}
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6
+; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13]
+; AVX512-NEXT: vpermw %ymm10, %ymm13, %ymm10
+; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
+; AVX512-NEXT: vprold $16, %ymm13, %ymm13
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512-NEXT: vmovdqu16 %ymm13, %ymm10 {%k2}
+; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [10,9,8,11,10,9,8,11,10,9,8,11,12,12,12,12]
-; AVX512-NEXT: vpermw %ymm5, %ymm15, %ymm2 {%k1}
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7]
-; AVX512-NEXT: vpermw %ymm1, %ymm5, %ymm1
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7]
-; AVX512-NEXT: vpermw %ymm4, %ymm5, %ymm1 {%k1}
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
+; AVX512-NEXT: vpermw %ymm13, %ymm15, %ymm10 {%k1}
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7]
+; AVX512-NEXT: vpermw %ymm7, %ymm9, %ymm7
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7]
+; AVX512-NEXT: vpermw %ymm8, %ymm9, %ymm7 {%k1}
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
; AVX512-NEXT: movw $-28087, %cx # imm = 0x9249
; AVX512-NEXT: kmovd %ecx, %k3
-; AVX512-NEXT: vpermw %ymm3, %ymm4, %ymm1 {%k3}
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15],ymm10[24],ymm11[24],ymm10[25],ymm11[25],ymm10[26],ymm11[26],ymm10[27],ymm11[27],ymm10[28],ymm11[28],ymm10[29],ymm11[29],ymm10[30],ymm11[30],ymm10[31],ymm11[31]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15],ymm12[24],ymm13[24],ymm12[25],ymm13[25],ymm12[26],ymm13[26],ymm12[27],ymm13[27],ymm12[28],ymm13[28],ymm12[29],ymm13[29],ymm12[30],ymm13[30],ymm12[31],ymm13[31]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512-NEXT: vpermw %ymm3, %ymm4, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512-NEXT: vpermw %ymm2, %ymm4, %ymm3 {%k1}
-; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15],ymm8[24],ymm9[24],ymm8[25],ymm9[25],ymm8[26],ymm9[26],ymm8[27],ymm9[27],ymm8[28],ymm9[28],ymm8[29],ymm9[29],ymm8[30],ymm9[30],ymm8[31],ymm9[31]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512-NEXT: vpermw %ymm2, %ymm4, %ymm3 {%k3}
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm2, %ymm13, %ymm4
-; AVX512-NEXT: vpshufb %ymm2, %ymm12, %ymm2
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm4, %ymm11, %ymm5
-; AVX512-NEXT: vpshufb %ymm4, %ymm10, %ymm4
+; AVX512-NEXT: vpermw %ymm8, %ymm9, %ymm7 {%k3}
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7
+; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512-NEXT: vpermw %ymm9, %ymm10, %ymm9
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1}
+; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
+; AVX512-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k3}
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm8, %ymm5, %ymm5
+; AVX512-NEXT: vpshufb %ymm8, %ymm4, %ymm4
; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512-NEXT: vmovdqu16 %ymm2, %ymm4 {%k1}
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm2, %ymm9, %ymm5
-; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm2
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[16],ymm5[16],ymm2[17],ymm5[17],ymm2[18],ymm5[18],ymm2[19],ymm5[19],ymm2[20],ymm5[20],ymm2[21],ymm5[21],ymm2[22],ymm5[22],ymm2[23],ymm5[23]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT: vmovdqu16 %ymm2, %ymm4 {%k2}
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm2
-; AVX512-NEXT: vmovdqu64 %zmm2, 128(%rax)
-; AVX512-NEXT: vmovdqu64 %zmm1, 64(%rax)
-; AVX512-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1}
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT: vmovdqu16 %ymm0, %ymm2 {%k2}
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rax)
+; AVX512-NEXT: vmovdqu64 %zmm7, 64(%rax)
+; AVX512-NEXT: vmovdqu64 %zmm6, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 32
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index 505d9c8463c1a..008188b52c200 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -51,48 +51,48 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) {
; CHECK-NEXT: shldw $1, %cx, %dx
; CHECK-NEXT: sarl $16, %ecx
; CHECK-NEXT: cmpl $16384, %ecx # imm = 0x4000
-; CHECK-NEXT: movl $32767, %r8d # imm = 0x7FFF
-; CHECK-NEXT: cmovgel %r8d, %edx
+; CHECK-NEXT: movl $32767, %eax # imm = 0x7FFF
+; CHECK-NEXT: cmovgel %eax, %edx
; CHECK-NEXT: cmpl $-16384, %ecx # imm = 0xC000
; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT: cmovll %ecx, %edx
; CHECK-NEXT: pextrw $1, %xmm0, %esi
; CHECK-NEXT: leal (%rsi,%rsi), %edi
-; CHECK-NEXT: movswl %si, %eax
-; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: movswl %si, %r8d
+; CHECK-NEXT: movl %r8d, %esi
; CHECK-NEXT: shrl $16, %esi
; CHECK-NEXT: shldw $1, %di, %si
-; CHECK-NEXT: sarl $16, %eax
-; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000
-; CHECK-NEXT: cmovgel %r8d, %esi
-; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000
+; CHECK-NEXT: sarl $16, %r8d
+; CHECK-NEXT: cmpl $16384, %r8d # imm = 0x4000
+; CHECK-NEXT: cmovgel %eax, %esi
+; CHECK-NEXT: cmpl $-16384, %r8d # imm = 0xC000
; CHECK-NEXT: cmovll %ecx, %esi
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: cwtl
-; CHECK-NEXT: movl %eax, %edi
-; CHECK-NEXT: shrl $16, %edi
-; CHECK-NEXT: shldw $1, %ax, %di
-; CHECK-NEXT: sarl $16, %eax
-; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000
-; CHECK-NEXT: cmovgel %r8d, %edi
-; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000
-; CHECK-NEXT: cmovll %ecx, %edi
-; CHECK-NEXT: movzwl %di, %eax
-; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movd %xmm0, %edi
+; CHECK-NEXT: movswl %di, %edi
+; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: shrl $16, %r8d
+; CHECK-NEXT: shldw $1, %di, %r8w
+; CHECK-NEXT: sarl $16, %edi
+; CHECK-NEXT: cmpl $16384, %edi # imm = 0x4000
+; CHECK-NEXT: cmovgel %eax, %r8d
+; CHECK-NEXT: cmpl $-16384, %edi # imm = 0xC000
+; CHECK-NEXT: cmovll %ecx, %r8d
+; CHECK-NEXT: movzwl %r8w, %edi
+; CHECK-NEXT: movd %edi, %xmm1
; CHECK-NEXT: pinsrw $1, %esi, %xmm1
; CHECK-NEXT: pinsrw $2, %edx, %xmm1
-; CHECK-NEXT: pextrw $3, %xmm0, %eax
-; CHECK-NEXT: cwtl
-; CHECK-NEXT: leal (,%rax,4), %edx
-; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shrl $16, %esi
-; CHECK-NEXT: shldw $1, %dx, %si
-; CHECK-NEXT: sarl $14, %eax
-; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000
-; CHECK-NEXT: cmovgel %r8d, %esi
-; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000
-; CHECK-NEXT: cmovll %ecx, %esi
-; CHECK-NEXT: pinsrw $3, %esi, %xmm1
+; CHECK-NEXT: pextrw $3, %xmm0, %edx
+; CHECK-NEXT: movswl %dx, %edx
+; CHECK-NEXT: leal (,%rdx,4), %esi
+; CHECK-NEXT: movl %esi, %edi
+; CHECK-NEXT: shrl $16, %edi
+; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: sarl $14, %edx
+; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000
+; CHECK-NEXT: cmovgel %eax, %edi
+; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000
+; CHECK-NEXT: cmovll %ecx, %edi
+; CHECK-NEXT: pinsrw $3, %edi, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll
index 5e001758eea45..f9972bf69cb22 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll
@@ -201,45 +201,45 @@ define i64 @test_v8i64_v8i8(<8 x i8> %a0) {
define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i64_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm12
-; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE2-NEXT: movdqa %xmm0, %xmm13
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $24, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $24, %xmm6
-; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE2-NEXT: psrad $24, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; SSE2-NEXT: paddq %xmm5, %xmm4
-; SSE2-NEXT: paddq %xmm11, %xmm4
-; SSE2-NEXT: paddq %xmm13, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
+; SSE2-NEXT: movdqa %xmm6, %xmm11
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
+; SSE2-NEXT: paddq %xmm10, %xmm11
+; SSE2-NEXT: paddq %xmm5, %xmm11
+; SSE2-NEXT: paddq %xmm8, %xmm11
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1]
-; SSE2-NEXT: paddq %xmm6, %xmm3
-; SSE2-NEXT: paddq %xmm1, %xmm3
-; SSE2-NEXT: paddq %xmm4, %xmm3
-; SSE2-NEXT: paddq %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE2-NEXT: paddq %xmm3, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm6
+; SSE2-NEXT: paddq %xmm1, %xmm6
+; SSE2-NEXT: paddq %xmm11, %xmm6
+; SSE2-NEXT: paddq %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE2-NEXT: paddq %xmm6, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
@@ -1976,11 +1976,11 @@ define i8 @test_v128i8_v128i1(<128 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm8
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm6
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm7
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm6
-; AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm8
+; AVX1-NEXT: vpaddb %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpaddb %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1993,7 +1993,7 @@ define i8 @test_v128i8_v128i1(<128 x i8> %a0) {
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
index 3b4825713f225..14884728af162 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
@@ -302,24 +302,24 @@ define float @test_v8f32(<8 x float> %a0) {
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm2
+; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm8
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm8, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm7
; AVX-NEXT: vmaxss %xmm0, %xmm6, %xmm0
-; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm7, %xmm6, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm6
; AVX-NEXT: vmaxss %xmm0, %xmm5, %xmm0
-; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm6, %xmm5, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm5
; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm5, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
; AVX-NEXT: vmaxss %xmm0, %xmm4, %xmm0
; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0
@@ -327,25 +327,25 @@ define float @test_v8f32(<8 x float> %a0) {
; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm0
; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vmaxss %xmm0, %xmm8, %xmm0
-; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0
+; AVX-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512BW-LABEL: test_v8f32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm1
+; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm8
; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vmaxss %xmm1, %xmm6, %xmm0
+; AVX512BW-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm8, %xmm8, %k1
+; AVX512BW-NEXT: vmaxss %xmm8, %xmm6, %xmm0
; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmaxss %xmm0, %xmm5, %xmm0
@@ -360,25 +360,25 @@ define float @test_v8f32(<8 x float> %a0) {
; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vmaxss %xmm0, %xmm8, %xmm0
-; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_v8f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm2
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm8
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmaxss %xmm2, %xmm6, %xmm0
+; AVX512VL-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm8, %xmm8, %k1
+; AVX512VL-NEXT: vmaxss %xmm8, %xmm6, %xmm0
; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0
@@ -393,8 +393,8 @@ define float @test_v8f32(<8 x float> %a0) {
; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0
; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0)
@@ -596,65 +596,65 @@ define float @test_v16f32(<16 x float> %a0) {
; AVX512VL-LABEL: test_v16f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0]
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3]
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0]
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3]
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3]
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0]
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm7
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1
-; AVX512VL-NEXT: vmaxss %xmm7, %xmm4, %xmm0
-; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm16
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0
-; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm16, %xmm0
-; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm0
-; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm14, %xmm0
+; AVX512VL-NEXT: vmovss %xmm15, %xmm16, %xmm16 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm16, %xmm16, %k1
+; AVX512VL-NEXT: vmaxss %xmm16, %xmm14, %xmm0
; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0
-; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmaxss %xmm0, %xmm13, %xmm0
; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0
+; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmaxss %xmm0, %xmm12, %xmm0
; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmaxss %xmm0, %xmm11, %xmm0
; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0
-; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmaxss %xmm0, %xmm10, %xmm0
; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0
-; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0
+; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0
; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm0
+; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0
+; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm4, %xmm0
+; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0)
@@ -856,17 +856,17 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512BW-LABEL: test_v8f64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm1
+; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm8
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vmaxsd %xmm1, %xmm5, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm8, %xmm8, %k1
+; AVX512BW-NEXT: vmaxsd %xmm8, %xmm5, %xmm0
; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmaxsd %xmm0, %xmm6, %xmm0
@@ -881,25 +881,25 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512BW-NEXT: vmaxsd %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vmaxsd %xmm0, %xmm8, %xmm0
-; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_v8f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm2
+; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm8
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vmaxsd %xmm2, %xmm5, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm8, %xmm8, %k1
+; AVX512VL-NEXT: vmaxsd %xmm8, %xmm5, %xmm0
; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmaxsd %xmm0, %xmm6, %xmm0
@@ -914,8 +914,8 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512VL-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxsd %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vmaxsd %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0)
@@ -980,41 +980,40 @@ define double @test_v16f64(<16 x double> %a0) {
;
; SSE41-LABEL: test_v16f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm3, %xmm8
-; SSE41-NEXT: movapd %xmm4, %xmm3
-; SSE41-NEXT: maxpd %xmm0, %xmm3
+; SSE41-NEXT: movapd %xmm4, %xmm8
+; SSE41-NEXT: maxpd %xmm0, %xmm8
; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
; SSE41-NEXT: movapd %xmm6, %xmm4
; SSE41-NEXT: maxpd %xmm2, %xmm4
; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm2
-; SSE41-NEXT: maxpd %xmm3, %xmm2
-; SSE41-NEXT: cmpunordpd %xmm3, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: maxpd %xmm8, %xmm2
+; SSE41-NEXT: cmpunordpd %xmm8, %xmm8
+; SSE41-NEXT: movapd %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movapd %xmm5, %xmm3
-; SSE41-NEXT: maxpd %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm5, %xmm4
+; SSE41-NEXT: maxpd %xmm1, %xmm4
; SSE41-NEXT: cmpunordpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: maxpd %xmm8, %xmm1
-; SSE41-NEXT: cmpunordpd %xmm8, %xmm8
-; SSE41-NEXT: movapd %xmm8, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: maxpd %xmm3, %xmm4
+; SSE41-NEXT: maxpd %xmm3, %xmm1
; SSE41-NEXT: cmpunordpd %xmm3, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: maxpd %xmm4, %xmm3
+; SSE41-NEXT: cmpunordpd %xmm4, %xmm4
+; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: maxpd %xmm2, %xmm1
; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index 8201072fb284d..840dfcbdbb394 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -235,24 +235,24 @@ define float @test_v8f32(<8 x float> %a0) {
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; AVX-NEXT: vminss %xmm0, %xmm7, %xmm2
+; AVX-NEXT: vminss %xmm0, %xmm7, %xmm8
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm8, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm7
; AVX-NEXT: vminss %xmm0, %xmm6, %xmm0
-; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm7, %xmm6, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm6
; AVX-NEXT: vminss %xmm0, %xmm5, %xmm0
-; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm6, %xmm5, %xmm0, %xmm0
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm5
; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm5, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
; AVX-NEXT: vminss %xmm0, %xmm4, %xmm0
; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0
@@ -260,25 +260,25 @@ define float @test_v8f32(<8 x float> %a0) {
; AVX-NEXT: vminss %xmm0, %xmm3, %xmm0
; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vminss %xmm0, %xmm8, %xmm0
-; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0
+; AVX-NEXT: vminss %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512BW-LABEL: test_v8f32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm1
+; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm8
; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vminss %xmm1, %xmm6, %xmm0
+; AVX512BW-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1}
+; AVX512BW-NEXT: vcmpunordss %xmm8, %xmm8, %k1
+; AVX512BW-NEXT: vminss %xmm8, %xmm6, %xmm0
; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vminss %xmm0, %xmm5, %xmm0
@@ -293,25 +293,25 @@ define float @test_v8f32(<8 x float> %a0) {
; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vminss %xmm0, %xmm8, %xmm0
-; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vminss %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_v8f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm2
+; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm8
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vminss %xmm2, %xmm6, %xmm0
+; AVX512VL-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm8, %xmm8, %k1
+; AVX512VL-NEXT: vminss %xmm8, %xmm6, %xmm0
; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0
@@ -326,8 +326,8 @@ define float @test_v8f32(<8 x float> %a0) {
; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0
; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0)
@@ -529,65 +529,65 @@ define float @test_v16f32(<16 x float> %a0) {
; AVX512VL-LABEL: test_v16f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0]
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3]
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0]
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3]
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3]
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0]
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm7
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1
-; AVX512VL-NEXT: vminss %xmm7, %xmm4, %xmm0
-; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm16
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0
-; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm16, %xmm0
-; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm0
-; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm14, %xmm0
+; AVX512VL-NEXT: vmovss %xmm15, %xmm16, %xmm16 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm16, %xmm16, %k1
+; AVX512VL-NEXT: vminss %xmm16, %xmm14, %xmm0
; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0
-; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vminss %xmm0, %xmm13, %xmm0
; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0
+; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vminss %xmm0, %xmm12, %xmm0
; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vminss %xmm0, %xmm11, %xmm0
; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0
-; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vminss %xmm0, %xmm10, %xmm0
; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0
-; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0
+; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0
; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm0
+; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0
+; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm4, %xmm0
+; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0)
@@ -859,17 +859,17 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512BW-LABEL: test_v8f64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm1
+; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm8
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vminsd %xmm1, %xmm5, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm8, %xmm8, %k1
+; AVX512BW-NEXT: vminsd %xmm8, %xmm5, %xmm0
; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vminsd %xmm0, %xmm6, %xmm0
@@ -884,25 +884,25 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512BW-NEXT: vminsd %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vminsd %xmm0, %xmm8, %xmm0
-; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vminsd %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_v8f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm2
+; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm8
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1
-; AVX512VL-NEXT: vminsd %xmm2, %xmm5, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm8, %xmm8, %k1
+; AVX512VL-NEXT: vminsd %xmm8, %xmm5, %xmm0
; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vminsd %xmm0, %xmm6, %xmm0
@@ -917,8 +917,8 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512VL-NEXT: vminsd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminsd %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vminsd %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0)
@@ -983,41 +983,40 @@ define double @test_v16f64(<16 x double> %a0) {
;
; SSE41-LABEL: test_v16f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm3, %xmm8
-; SSE41-NEXT: movapd %xmm4, %xmm3
-; SSE41-NEXT: minpd %xmm0, %xmm3
+; SSE41-NEXT: movapd %xmm4, %xmm8
+; SSE41-NEXT: minpd %xmm0, %xmm8
; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
; SSE41-NEXT: movapd %xmm6, %xmm4
; SSE41-NEXT: minpd %xmm2, %xmm4
; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm2
-; SSE41-NEXT: minpd %xmm3, %xmm2
-; SSE41-NEXT: cmpunordpd %xmm3, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: minpd %xmm8, %xmm2
+; SSE41-NEXT: cmpunordpd %xmm8, %xmm8
+; SSE41-NEXT: movapd %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movapd %xmm5, %xmm3
-; SSE41-NEXT: minpd %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm5, %xmm4
+; SSE41-NEXT: minpd %xmm1, %xmm4
; SSE41-NEXT: cmpunordpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: minpd %xmm8, %xmm1
-; SSE41-NEXT: cmpunordpd %xmm8, %xmm8
-; SSE41-NEXT: movapd %xmm8, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: minpd %xmm3, %xmm4
+; SSE41-NEXT: minpd %xmm3, %xmm1
; SSE41-NEXT: cmpunordpd %xmm3, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: minpd %xmm4, %xmm3
+; SSE41-NEXT: cmpunordpd %xmm4, %xmm4
+; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: minpd %xmm2, %xmm1
; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 1a81ae4fa3c85..403502d292d10 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -460,16 +460,16 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: psllq $32, %xmm9
; SSE-NEXT: pmuludq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm9, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm4, %xmm8
-; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm0, %xmm6
-; SSE-NEXT: paddq %xmm8, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm4, %xmm8
+; SSE-NEXT: psrlq $32, %xmm8
+; SSE-NEXT: pmuludq %xmm0, %xmm8
+; SSE-NEXT: paddq %xmm6, %xmm8
+; SSE-NEXT: psllq $32, %xmm8
; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: paddq %xmm6, %xmm0
+; SSE-NEXT: paddq %xmm8, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm7, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index da8ac4d8bcf8a..cb3634508e4ce 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -738,33 +738,33 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm8
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm8
+; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm7, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
+; AVX1-NEXT: vpxor %xmm4, %xmm8, %xmm9
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm9, %xmm7
+; AVX1-NEXT: vblendvpd %xmm7, %xmm8, %xmm6, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm1
-; AVX1-NEXT: vxorpd %xmm4, %xmm8, %xmm2
+; AVX1-NEXT: vxorpd %xmm4, %xmm6, %xmm1
+; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm8, %xmm5, %xmm1
+; AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm6, %xmm1
; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index 24b3d81965578..e00380f7108ed 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -744,7 +744,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm8
+; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
@@ -752,23 +752,23 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vxorpd %xmm4, %xmm6, %xmm1
-; AVX1-NEXT: vxorpd %xmm4, %xmm8, %xmm2
+; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm8, %xmm1
+; AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm5, %xmm1
; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 2947e35c85912..4bc3176aed901 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -342,10 +342,10 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7
-; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8
+; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
@@ -363,7 +363,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
index f686a408b0ddf..4e90e4c5fa4da 100644
--- a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
@@ -17,105 +17,105 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; SSE-NEXT: testl %edx, %edx
; SSE-NEXT: jle .LBB0_9
; SSE-NEXT: # %bb.1: # %for.body.preheader
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: movl %edx, %eax
+; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: movl %edx, %r9d
; SSE-NEXT: cmpl $31, %edx
; SSE-NEXT: ja .LBB0_3
; SSE-NEXT: # %bb.2:
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: jmp .LBB0_6
; SSE-NEXT: .LBB0_3: # %vector.ph
-; SSE-NEXT: movl %eax, %edx
+; SSE-NEXT: movl %r9d, %edx
; SSE-NEXT: andl $-32, %edx
-; SSE-NEXT: movd %r9d, %xmm0
-; SSE-NEXT: movd %r8d, %xmm2
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movd %r8d, %xmm1
; SSE-NEXT: xorl %ecx, %ecx
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm15 = xmm2[0],zero,xmm2[1],zero
+; SSE-NEXT: pxor %xmm8, %xmm8
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB0_4: # %vector.body
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm11 = mem[0],zero
+; SSE-NEXT: pcmpeqb %xmm8, %xmm0
; SSE-NEXT: pmovsxbd %xmm0, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE-NEXT: pcmpeqb %xmm1, %xmm3
-; SSE-NEXT: pmovsxbd %xmm3, %xmm13
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; SSE-NEXT: pmovsxbd %xmm3, %xmm6
-; SSE-NEXT: pcmpeqb %xmm1, %xmm4
-; SSE-NEXT: pmovsxbd %xmm4, %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
-; SSE-NEXT: pmovsxbd %xmm3, %xmm2
-; SSE-NEXT: pcmpeqb %xmm1, %xmm5
-; SSE-NEXT: pmovsxbd %xmm5, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; SSE-NEXT: pmovsxbd %xmm3, %xmm10
-; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pslld %xmm14, %xmm4
-; SSE-NEXT: pslld %xmm15, %xmm3
-; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm3
-; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm8
-; SSE-NEXT: movdqa %xmm8, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm15, %xmm8
+; SSE-NEXT: pcmpeqb %xmm8, %xmm1
+; SSE-NEXT: pmovsxbd %xmm1, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; SSE-NEXT: pmovsxbd %xmm1, %xmm6
+; SSE-NEXT: pcmpeqb %xmm8, %xmm2
+; SSE-NEXT: pmovsxbd %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: pmovsxbd %xmm1, %xmm4
+; SSE-NEXT: pcmpeqb %xmm8, %xmm11
+; SSE-NEXT: pmovsxbd %xmm11, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1]
+; SSE-NEXT: pmovsxbd %xmm2, %xmm2
+; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm11
+; SSE-NEXT: movdqa %xmm11, %xmm12
+; SSE-NEXT: pslld %xmm9, %xmm12
+; SSE-NEXT: pslld %xmm10, %xmm11
+; SSE-NEXT: blendvps %xmm0, %xmm12, %xmm11
+; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm13
+; SSE-NEXT: pslld %xmm9, %xmm13
+; SSE-NEXT: pslld %xmm10, %xmm12
; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm8
-; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm12
-; SSE-NEXT: movdqa %xmm12, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm15, %xmm12
+; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm12
+; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm13
+; SSE-NEXT: pslld %xmm9, %xmm13
+; SSE-NEXT: pslld %xmm10, %xmm7
; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm12
+; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm7
; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm6
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm15, %xmm6
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm6
-; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm15, %xmm1
+; SSE-NEXT: movdqa %xmm6, %xmm13
+; SSE-NEXT: pslld %xmm9, %xmm13
+; SSE-NEXT: pslld %xmm10, %xmm6
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm6
+; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm13
+; SSE-NEXT: pslld %xmm9, %xmm13
+; SSE-NEXT: pslld %xmm10, %xmm5
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm5
+; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm13
+; SSE-NEXT: pslld %xmm9, %xmm13
+; SSE-NEXT: pslld %xmm10, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm4
+; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm13
+; SSE-NEXT: pslld %xmm9, %xmm13
+; SSE-NEXT: pslld %xmm10, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: pslld %xmm14, %xmm2
-; SSE-NEXT: pslld %xmm15, %xmm5
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm5
-; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: pslld %xmm14, %xmm4
-; SSE-NEXT: pslld %xmm15, %xmm2
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm2
-; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm7
-; SSE-NEXT: pslld %xmm14, %xmm7
-; SSE-NEXT: pslld %xmm15, %xmm4
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm4
-; SSE-NEXT: movups %xmm8, (%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm3, 16(%rdi,%rcx,4)
+; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm3
+; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm13
+; SSE-NEXT: pslld %xmm9, %xmm13
+; SSE-NEXT: pslld %xmm10, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm2
+; SSE-NEXT: movups %xmm12, (%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm11, 16(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm6, 32(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm12, 48(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm5, 64(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm1, 80(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm4, 96(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm2, 112(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm7, 48(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm4, 64(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm5, 80(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm2, 96(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm3, 112(%rdi,%rcx,4)
; SSE-NEXT: addq $32, %rcx
; SSE-NEXT: cmpq %rcx, %rdx
; SSE-NEXT: jne .LBB0_4
; SSE-NEXT: # %bb.5: # %middle.block
-; SSE-NEXT: cmpq %rax, %rdx
+; SSE-NEXT: cmpq %r9, %rdx
; SSE-NEXT: jne .LBB0_6
; SSE-NEXT: .LBB0_9: # %for.cond.cleanup
; SSE-NEXT: retq
@@ -125,12 +125,12 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; SSE-NEXT: shll %cl, (%rdi,%rdx,4)
; SSE-NEXT: incq %rdx
-; SSE-NEXT: cmpq %rdx, %rax
+; SSE-NEXT: cmpq %rdx, %r9
; SSE-NEXT: je .LBB0_9
; SSE-NEXT: .LBB0_6: # %for.body
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-NEXT: cmpb $0, (%rsi,%rdx)
-; SSE-NEXT: movl %r9d, %ecx
+; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: je .LBB0_8
; SSE-NEXT: # %bb.7: # %for.body
; SSE-NEXT: # in Loop: Header=BB0_6 Depth=1
@@ -142,104 +142,102 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; AVX1-NEXT: testl %edx, %edx
; AVX1-NEXT: jle .LBB0_9
; AVX1-NEXT: # %bb.1: # %for.body.preheader
-; AVX1-NEXT: movl %ecx, %r9d
-; AVX1-NEXT: movl %edx, %eax
+; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: movl %edx, %r9d
; AVX1-NEXT: cmpl $31, %edx
; AVX1-NEXT: ja .LBB0_3
; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: jmp .LBB0_6
; AVX1-NEXT: .LBB0_3: # %vector.ph
-; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: movl %r9d, %edx
; AVX1-NEXT: andl $-32, %edx
-; AVX1-NEXT: vmovd %r9d, %xmm0
+; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vmovd %r8d, %xmm1
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB0_4: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
-; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm12, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm12, %xmm2
-; AVX1-NEXT: vpmovsxbd %xmm2, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm12, %xmm3
-; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm7 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm8
-; AVX1-NEXT: vpslld %xmm7, %xmm8, %xmm9
-; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm10 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpslld %xmm10, %xmm8, %xmm0
-; AVX1-NEXT: vblendvps %xmm5, %xmm9, %xmm0, %xmm8
-; AVX1-NEXT: vpmovsxbd %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm4, %xmm12, %xmm4
-; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm0
-; AVX1-NEXT: vpslld %xmm7, %xmm0, %xmm7
-; AVX1-NEXT: vpslld %xmm10, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm9
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm12
-; AVX1-NEXT: vblendvps %xmm1, %xmm7, %xmm0, %xmm10
-; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpslld %xmm0, %xmm1, %xmm7
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpslld %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vblendvps %xmm6, %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm6
-; AVX1-NEXT: vpslld %xmm0, %xmm6, %xmm7
-; AVX1-NEXT: vpslld %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vblendvps %xmm2, %xmm7, %xmm6, %xmm2
-; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm6
-; AVX1-NEXT: vpslld %xmm13, %xmm6, %xmm7
-; AVX1-NEXT: vpslld %xmm14, %xmm6, %xmm6
-; AVX1-NEXT: vblendvps %xmm5, %xmm7, %xmm6, %xmm5
-; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm6
-; AVX1-NEXT: vpslld %xmm13, %xmm6, %xmm7
-; AVX1-NEXT: vpslld %xmm14, %xmm6, %xmm6
-; AVX1-NEXT: vblendvps %xmm3, %xmm7, %xmm6, %xmm3
-; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm6
-; AVX1-NEXT: vpslld %xmm15, %xmm6, %xmm7
-; AVX1-NEXT: vpslld %xmm11, %xmm6, %xmm6
-; AVX1-NEXT: vblendvps %xmm9, %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm7
-; AVX1-NEXT: vpslld %xmm15, %xmm7, %xmm0
-; AVX1-NEXT: vpslld %xmm11, %xmm7, %xmm7
-; AVX1-NEXT: vblendvps %xmm12, %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vmovups %xmm8, (%rdi,%rcx,4)
-; AVX1-NEXT: vmovups %xmm10, 16(%rdi,%rcx,4)
-; AVX1-NEXT: vmovups %xmm1, 32(%rdi,%rcx,4)
-; AVX1-NEXT: vmovups %xmm2, 48(%rdi,%rcx,4)
-; AVX1-NEXT: vmovups %xmm5, 64(%rdi,%rcx,4)
-; AVX1-NEXT: vmovups %xmm3, 80(%rdi,%rcx,4)
-; AVX1-NEXT: vmovups %xmm6, 96(%rdi,%rcx,4)
-; AVX1-NEXT: vmovups %xmm0, 112(%rdi,%rcx,4)
+; AVX1-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm11 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm12 = mem[0],zero
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm9, %xmm9
+; AVX1-NEXT: vpmovsxbd %xmm9, %xmm13
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1]
+; AVX1-NEXT: vpmovsxbd %xmm9, %xmm9
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm10, %xmm10
+; AVX1-NEXT: vpmovsxbd %xmm10, %xmm14
+; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1]
+; AVX1-NEXT: vpmovsxbd %xmm10, %xmm10
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm11, %xmm11
+; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm15 = mem[0],zero,mem[1],zero
+; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm0
+; AVX1-NEXT: vpslld %xmm15, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero
+; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm13, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm11, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,1,1]
+; AVX1-NEXT: vpmovsxbd %xmm11, %xmm11
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm12, %xmm12
+; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm13
+; AVX1-NEXT: vpslld %xmm15, %xmm13, %xmm15
+; AVX1-NEXT: vpslld %xmm2, %xmm13, %xmm2
+; AVX1-NEXT: vpmovsxbd %xmm12, %xmm13
+; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
+; AVX1-NEXT: vpmovsxbd %xmm12, %xmm12
+; AVX1-NEXT: vblendvps %xmm9, %xmm15, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm9
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vpslld %xmm3, %xmm9, %xmm15
+; AVX1-NEXT: vpslld %xmm4, %xmm9, %xmm9
+; AVX1-NEXT: vblendvps %xmm14, %xmm15, %xmm9, %xmm9
+; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm14
+; AVX1-NEXT: vpslld %xmm3, %xmm14, %xmm15
+; AVX1-NEXT: vpslld %xmm4, %xmm14, %xmm14
+; AVX1-NEXT: vblendvps %xmm10, %xmm15, %xmm14, %xmm10
+; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm14
+; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15
+; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14
+; AVX1-NEXT: vblendvps %xmm1, %xmm15, %xmm14, %xmm1
+; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm14
+; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15
+; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14
+; AVX1-NEXT: vblendvps %xmm11, %xmm15, %xmm14, %xmm11
+; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm14
+; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15
+; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14
+; AVX1-NEXT: vblendvps %xmm13, %xmm15, %xmm14, %xmm13
+; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm14
+; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15
+; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14
+; AVX1-NEXT: vblendvps %xmm12, %xmm15, %xmm14, %xmm12
+; AVX1-NEXT: vmovups %xmm0, (%rdi,%rcx,4)
+; AVX1-NEXT: vmovups %xmm2, 16(%rdi,%rcx,4)
+; AVX1-NEXT: vmovups %xmm9, 32(%rdi,%rcx,4)
+; AVX1-NEXT: vmovups %xmm10, 48(%rdi,%rcx,4)
+; AVX1-NEXT: vmovups %xmm1, 64(%rdi,%rcx,4)
+; AVX1-NEXT: vmovups %xmm11, 80(%rdi,%rcx,4)
+; AVX1-NEXT: vmovups %xmm13, 96(%rdi,%rcx,4)
+; AVX1-NEXT: vmovups %xmm12, 112(%rdi,%rcx,4)
; AVX1-NEXT: addq $32, %rcx
; AVX1-NEXT: cmpq %rcx, %rdx
; AVX1-NEXT: jne .LBB0_4
; AVX1-NEXT: # %bb.5: # %middle.block
-; AVX1-NEXT: cmpq %rax, %rdx
+; AVX1-NEXT: cmpq %r9, %rdx
; AVX1-NEXT: jne .LBB0_6
; AVX1-NEXT: .LBB0_9: # %for.cond.cleanup
; AVX1-NEXT: vzeroupper
@@ -250,12 +248,12 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
; AVX1-NEXT: shll %cl, (%rdi,%rdx,4)
; AVX1-NEXT: incq %rdx
-; AVX1-NEXT: cmpq %rdx, %rax
+; AVX1-NEXT: cmpq %rdx, %r9
; AVX1-NEXT: je .LBB0_9
; AVX1-NEXT: .LBB0_6: # %for.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: cmpb $0, (%rsi,%rdx)
-; AVX1-NEXT: movl %r9d, %ecx
+; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: je .LBB0_8
; AVX1-NEXT: # %bb.7: # %for.body
; AVX1-NEXT: # in Loop: Header=BB0_6 Depth=1
@@ -267,17 +265,17 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; AVX2-NEXT: testl %edx, %edx
; AVX2-NEXT: jle .LBB0_9
; AVX2-NEXT: # %bb.1: # %for.body.preheader
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: movl %edx, %r9d
; AVX2-NEXT: cmpl $31, %edx
; AVX2-NEXT: ja .LBB0_3
; AVX2-NEXT: # %bb.2:
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: jmp .LBB0_6
; AVX2-NEXT: .LBB0_3: # %vector.ph
-; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: movl %r9d, %edx
; AVX2-NEXT: andl $-32, %edx
-; AVX2-NEXT: vmovd %r9d, %xmm0
+; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vmovd %r8d, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
@@ -314,7 +312,7 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; AVX2-NEXT: cmpq %rcx, %rdx
; AVX2-NEXT: jne .LBB0_4
; AVX2-NEXT: # %bb.5: # %middle.block
-; AVX2-NEXT: cmpq %rax, %rdx
+; AVX2-NEXT: cmpq %r9, %rdx
; AVX2-NEXT: jne .LBB0_6
; AVX2-NEXT: .LBB0_9: # %for.cond.cleanup
; AVX2-NEXT: vzeroupper
@@ -325,12 +323,12 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; AVX2-NEXT: shll %cl, (%rdi,%rdx,4)
; AVX2-NEXT: incq %rdx
-; AVX2-NEXT: cmpq %rdx, %rax
+; AVX2-NEXT: cmpq %rdx, %r9
; AVX2-NEXT: je .LBB0_9
; AVX2-NEXT: .LBB0_6: # %for.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: cmpb $0, (%rsi,%rdx)
-; AVX2-NEXT: movl %r9d, %ecx
+; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: je .LBB0_8
; AVX2-NEXT: # %bb.7: # %for.body
; AVX2-NEXT: # in Loop: Header=BB0_6 Depth=1
@@ -342,78 +340,78 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; XOP-NEXT: testl %edx, %edx
; XOP-NEXT: jle .LBB0_9
; XOP-NEXT: # %bb.1: # %for.body.preheader
-; XOP-NEXT: movl %ecx, %r9d
-; XOP-NEXT: movl %edx, %eax
+; XOP-NEXT: movl %ecx, %eax
+; XOP-NEXT: movl %edx, %r9d
; XOP-NEXT: cmpl $31, %edx
; XOP-NEXT: ja .LBB0_3
; XOP-NEXT: # %bb.2:
; XOP-NEXT: xorl %edx, %edx
; XOP-NEXT: jmp .LBB0_6
; XOP-NEXT: .LBB0_3: # %vector.ph
-; XOP-NEXT: movl %eax, %edx
+; XOP-NEXT: movl %r9d, %edx
; XOP-NEXT: andl $-32, %edx
-; XOP-NEXT: vmovd %r9d, %xmm0
+; XOP-NEXT: vmovd %eax, %xmm0
; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9
+; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vmovd %r8d, %xmm1
; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm14
+; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; XOP-NEXT: xorl %ecx, %ecx
-; XOP-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; XOP-NEXT: vextractf128 $1, %ymm9, %xmm15
-; XOP-NEXT: vextractf128 $1, %ymm14, %xmm4
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vextractf128 $1, %ymm1, %xmm4
; XOP-NEXT: .p2align 4, 0x90
; XOP-NEXT: .LBB0_4: # %vector.body
; XOP-NEXT: # =>This Inner Loop Header: Depth=1
; XOP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; XOP-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero
; XOP-NEXT: vmovq {{.*#+}} xmm7 = mem[0],zero
-; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; XOP-NEXT: vpcomeqb %xmm8, %xmm5, %xmm5
-; XOP-NEXT: vpmovsxbd %xmm5, %xmm0
+; XOP-NEXT: vmovq {{.*#+}} xmm8 = mem[0],zero
+; XOP-NEXT: vpcomeqb %xmm2, %xmm5, %xmm5
+; XOP-NEXT: vpmovsxbd %xmm5, %xmm9
; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
; XOP-NEXT: vpmovsxbd %xmm5, %xmm5
-; XOP-NEXT: vpcomeqb %xmm8, %xmm6, %xmm6
+; XOP-NEXT: vpcomeqb %xmm2, %xmm6, %xmm6
; XOP-NEXT: vpmovsxbd %xmm6, %xmm10
; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1]
; XOP-NEXT: vpmovsxbd %xmm6, %xmm6
-; XOP-NEXT: vpcomeqb %xmm8, %xmm7, %xmm7
+; XOP-NEXT: vpcomeqb %xmm2, %xmm7, %xmm7
; XOP-NEXT: vpmovsxbd %xmm7, %xmm11
; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1]
; XOP-NEXT: vpmovsxbd %xmm7, %xmm7
-; XOP-NEXT: vpcomeqb %xmm8, %xmm2, %xmm2
-; XOP-NEXT: vpmovsxbd %xmm2, %xmm12
-; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; XOP-NEXT: vpmovsxbd %xmm2, %xmm2
-; XOP-NEXT: vblendvps %xmm5, %xmm15, %xmm4, %xmm5
-; XOP-NEXT: vpshld %xmm5, 16(%rdi,%rcx,4), %xmm13
-; XOP-NEXT: vblendvps %xmm0, %xmm9, %xmm14, %xmm0
-; XOP-NEXT: vpshld %xmm0, (%rdi,%rcx,4), %xmm0
-; XOP-NEXT: vblendvps %xmm6, %xmm15, %xmm4, %xmm6
+; XOP-NEXT: vpcomeqb %xmm2, %xmm8, %xmm8
+; XOP-NEXT: vpmovsxbd %xmm8, %xmm12
+; XOP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,1,1]
+; XOP-NEXT: vpmovsxbd %xmm8, %xmm8
+; XOP-NEXT: vblendvps %xmm5, %xmm3, %xmm4, %xmm5
+; XOP-NEXT: vpshld %xmm5, 16(%rdi,%rcx,4), %xmm5
+; XOP-NEXT: vblendvps %xmm9, %xmm0, %xmm1, %xmm9
+; XOP-NEXT: vpshld %xmm9, (%rdi,%rcx,4), %xmm9
+; XOP-NEXT: vblendvps %xmm6, %xmm3, %xmm4, %xmm6
; XOP-NEXT: vpshld %xmm6, 48(%rdi,%rcx,4), %xmm6
-; XOP-NEXT: vblendvps %xmm10, %xmm9, %xmm14, %xmm5
-; XOP-NEXT: vpshld %xmm5, 32(%rdi,%rcx,4), %xmm5
-; XOP-NEXT: vblendvps %xmm7, %xmm15, %xmm4, %xmm7
+; XOP-NEXT: vblendvps %xmm10, %xmm0, %xmm1, %xmm10
+; XOP-NEXT: vpshld %xmm10, 32(%rdi,%rcx,4), %xmm10
+; XOP-NEXT: vblendvps %xmm7, %xmm3, %xmm4, %xmm7
; XOP-NEXT: vpshld %xmm7, 80(%rdi,%rcx,4), %xmm7
-; XOP-NEXT: vblendvps %xmm11, %xmm9, %xmm14, %xmm1
-; XOP-NEXT: vpshld %xmm1, 64(%rdi,%rcx,4), %xmm1
-; XOP-NEXT: vblendvps %xmm2, %xmm15, %xmm4, %xmm2
-; XOP-NEXT: vpshld %xmm2, 112(%rdi,%rcx,4), %xmm2
-; XOP-NEXT: vblendvps %xmm12, %xmm9, %xmm14, %xmm3
-; XOP-NEXT: vpshld %xmm3, 96(%rdi,%rcx,4), %xmm3
-; XOP-NEXT: vmovdqu %xmm0, (%rdi,%rcx,4)
-; XOP-NEXT: vmovdqu %xmm13, 16(%rdi,%rcx,4)
-; XOP-NEXT: vmovdqu %xmm5, 32(%rdi,%rcx,4)
+; XOP-NEXT: vblendvps %xmm11, %xmm0, %xmm1, %xmm11
+; XOP-NEXT: vpshld %xmm11, 64(%rdi,%rcx,4), %xmm11
+; XOP-NEXT: vblendvps %xmm8, %xmm3, %xmm4, %xmm8
+; XOP-NEXT: vpshld %xmm8, 112(%rdi,%rcx,4), %xmm8
+; XOP-NEXT: vblendvps %xmm12, %xmm0, %xmm1, %xmm12
+; XOP-NEXT: vpshld %xmm12, 96(%rdi,%rcx,4), %xmm12
+; XOP-NEXT: vmovdqu %xmm9, (%rdi,%rcx,4)
+; XOP-NEXT: vmovdqu %xmm5, 16(%rdi,%rcx,4)
+; XOP-NEXT: vmovdqu %xmm10, 32(%rdi,%rcx,4)
; XOP-NEXT: vmovdqu %xmm6, 48(%rdi,%rcx,4)
-; XOP-NEXT: vmovdqu %xmm1, 64(%rdi,%rcx,4)
+; XOP-NEXT: vmovdqu %xmm11, 64(%rdi,%rcx,4)
; XOP-NEXT: vmovdqu %xmm7, 80(%rdi,%rcx,4)
-; XOP-NEXT: vmovdqu %xmm3, 96(%rdi,%rcx,4)
-; XOP-NEXT: vmovdqu %xmm2, 112(%rdi,%rcx,4)
+; XOP-NEXT: vmovdqu %xmm12, 96(%rdi,%rcx,4)
+; XOP-NEXT: vmovdqu %xmm8, 112(%rdi,%rcx,4)
; XOP-NEXT: addq $32, %rcx
; XOP-NEXT: cmpq %rcx, %rdx
; XOP-NEXT: jne .LBB0_4
; XOP-NEXT: # %bb.5: # %middle.block
-; XOP-NEXT: cmpq %rax, %rdx
+; XOP-NEXT: cmpq %r9, %rdx
; XOP-NEXT: jne .LBB0_6
; XOP-NEXT: .LBB0_9: # %for.cond.cleanup
; XOP-NEXT: vzeroupper
@@ -424,12 +422,12 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; XOP-NEXT: # kill: def $cl killed $cl killed $ecx
; XOP-NEXT: shll %cl, (%rdi,%rdx,4)
; XOP-NEXT: incq %rdx
-; XOP-NEXT: cmpq %rdx, %rax
+; XOP-NEXT: cmpq %rdx, %r9
; XOP-NEXT: je .LBB0_9
; XOP-NEXT: .LBB0_6: # %for.body
; XOP-NEXT: # =>This Inner Loop Header: Depth=1
; XOP-NEXT: cmpb $0, (%rsi,%rdx)
-; XOP-NEXT: movl %r9d, %ecx
+; XOP-NEXT: movl %eax, %ecx
; XOP-NEXT: je .LBB0_8
; XOP-NEXT: # %bb.7: # %for.body
; XOP-NEXT: # in Loop: Header=BB0_6 Depth=1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
index 7a12f5075055b..106059017e54c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
@@ -11,10 +11,10 @@ define <64 x i8> @f1(ptr %p0) {
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,1,3,7,9,13,15,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
@@ -23,24 +23,24 @@ define <64 x i8> @f1(ptr %p0) {
; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13>
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128>
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128>
+; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovdqa 112(%rdi), %xmm2
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
@@ -153,39 +153,39 @@ define <64 x i8> @f2(ptr %p0) {
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,1,5,7,11,13,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,1,5,7,11,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15>
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128>
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128>
+; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa 112(%rdi), %xmm3
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: f2:
@@ -293,19 +293,19 @@ define <64 x i8> @f3(ptr %p0) {
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128>
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14>
-; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa (%rdi), %xmm4
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm6
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm4
; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
@@ -314,16 +314,16 @@ define <64 x i8> @f3(ptr %p0) {
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa 160(%rdi), %xmm2
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa 176(%rdi), %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa 112(%rdi), %xmm5
-; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vmovdqa 112(%rdi), %xmm3
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm5
+; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
@@ -441,39 +441,39 @@ define <64 x i8> @f4(ptr %p0) {
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,0,4,6,10,12,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,0,4,6,10,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14>
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128>
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128>
+; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa 112(%rdi), %xmm3
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: f4:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 8d3961da5dc4a..727b3ff2eb45c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -244,10 +244,10 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSE2-NEXT: # kill: def $edx killed $edx def $rdx
; SSE2-NEXT: # kill: def $esi killed $esi def $rsi
; SSE2-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; SSE2-NEXT: andl $7, %r10d
; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $7, %eax
+; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: andl $7, %r10d
; SSE2-NEXT: andl $7, %edi
; SSE2-NEXT: andl $7, %esi
; SSE2-NEXT: andl $7, %edx
@@ -271,9 +271,9 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
@@ -288,10 +288,10 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSSE3-NEXT: # kill: def $edx killed $edx def $rdx
; SSSE3-NEXT: # kill: def $esi killed $esi def $rsi
; SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
-; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: andl $7, %r10d
; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $7, %eax
+; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT: andl $7, %r10d
; SSSE3-NEXT: andl $7, %edi
; SSSE3-NEXT: andl $7, %esi
; SSSE3-NEXT: andl $7, %edx
@@ -315,9 +315,9 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
@@ -332,10 +332,10 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSE41-NEXT: # kill: def $edx killed $edx def $rdx
; SSE41-NEXT: # kill: def $esi killed $esi def $rsi
; SSE41-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; SSE41-NEXT: andl $7, %r10d
; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; SSE41-NEXT: andl $7, %eax
+; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSE41-NEXT: andl $7, %r10d
; SSE41-NEXT: andl $7, %edi
; SSE41-NEXT: andl $7, %esi
; SSE41-NEXT: andl $7, %edx
@@ -350,8 +350,8 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm0
; SSE41-NEXT: pinsrw $4, -24(%rsp,%r8,2), %xmm0
; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0
-; SSE41-NEXT: pinsrw $6, -24(%rsp,%rax,2), %xmm0
-; SSE41-NEXT: pinsrw $7, -24(%rsp,%r10,2), %xmm0
+; SSE41-NEXT: pinsrw $6, -24(%rsp,%r10,2), %xmm0
+; SSE41-NEXT: pinsrw $7, -24(%rsp,%rax,2), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
@@ -362,10 +362,10 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; AVX-NEXT: # kill: def $edx killed $edx def $rdx
; AVX-NEXT: # kill: def $esi killed $esi def $rsi
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; AVX-NEXT: andl $7, %r10d
; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; AVX-NEXT: andl $7, %eax
+; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; AVX-NEXT: andl $7, %r10d
; AVX-NEXT: andl $7, %edi
; AVX-NEXT: andl $7, %esi
; AVX-NEXT: andl $7, %edx
@@ -380,8 +380,8 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, -24(%rsp,%r10,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, -24(%rsp,%r10,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <8 x i16> %x, i16 %i0
%x1 = extractelement <8 x i16> %x, i16 %i1
@@ -415,15 +415,15 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: movd %eax, %xmm4
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -431,7 +431,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movd %eax, %xmm5
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -439,51 +439,51 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm11
+; SSE2-NEXT: movd %eax, %xmm8
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm6
; SSE2-NEXT: andl $15, %ecx
; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax
-; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: movd %eax, %xmm9
; SSE2-NEXT: andl $15, %edx
; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax
-; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: movd %eax, %xmm10
; SSE2-NEXT: andl $15, %esi
; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax
-; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: movd %eax, %xmm11
; SSE2-NEXT: andl $15, %edi
; SSE2-NEXT: movzbl -24(%rsp,%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: andl $15, %r9d
; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax
-; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: movd %eax, %xmm12
; SSE2-NEXT: andl $15, %r8d
; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax
-; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movd %eax, %xmm13
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd %eax, %xmm14
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE2-NEXT: retq
;
@@ -499,15 +499,15 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm8
+; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm9
+; SSSE3-NEXT: movd %eax, %xmm4
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -515,7 +515,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm10
+; SSSE3-NEXT: movd %eax, %xmm5
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
@@ -523,51 +523,51 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm11
+; SSSE3-NEXT: movd %eax, %xmm8
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm6
; SSSE3-NEXT: andl $15, %ecx
; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax
-; SSSE3-NEXT: movd %eax, %xmm12
+; SSSE3-NEXT: movd %eax, %xmm9
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax
-; SSSE3-NEXT: movd %eax, %xmm5
+; SSSE3-NEXT: movd %eax, %xmm10
; SSSE3-NEXT: andl $15, %esi
; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax
-; SSSE3-NEXT: movd %eax, %xmm13
+; SSSE3-NEXT: movd %eax, %xmm11
; SSSE3-NEXT: andl $15, %edi
; SSSE3-NEXT: movzbl -24(%rsp,%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: andl $15, %r9d
; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax
-; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: movd %eax, %xmm12
; SSSE3-NEXT: andl $15, %r8d
; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax
-; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movd %eax, %xmm13
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movd %eax, %xmm14
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT: retq
;
@@ -820,85 +820,85 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movzbl 1(%rdi), %r9d
-; SSE2-NEXT: movzbl 2(%rdi), %r10d
-; SSE2-NEXT: movzbl 3(%rdi), %r11d
-; SSE2-NEXT: movzbl 4(%rdi), %r14d
-; SSE2-NEXT: movzbl 5(%rdi), %r15d
-; SSE2-NEXT: movzbl 6(%rdi), %r12d
-; SSE2-NEXT: movzbl 7(%rdi), %r13d
+; SSE2-NEXT: movzbl 1(%rdi), %ecx
+; SSE2-NEXT: movzbl 2(%rdi), %edx
+; SSE2-NEXT: movzbl 3(%rdi), %esi
+; SSE2-NEXT: movzbl 4(%rdi), %r8d
+; SSE2-NEXT: movzbl 5(%rdi), %r9d
+; SSE2-NEXT: movzbl 6(%rdi), %r10d
+; SSE2-NEXT: movzbl 7(%rdi), %r11d
; SSE2-NEXT: movzbl 8(%rdi), %ebx
-; SSE2-NEXT: movzbl 9(%rdi), %r8d
-; SSE2-NEXT: movzbl 10(%rdi), %ecx
-; SSE2-NEXT: movzbl 11(%rdi), %edx
-; SSE2-NEXT: movzbl 12(%rdi), %esi
+; SSE2-NEXT: movzbl 9(%rdi), %r14d
+; SSE2-NEXT: movzbl 10(%rdi), %r15d
+; SSE2-NEXT: movzbl 11(%rdi), %r12d
+; SSE2-NEXT: movzbl 12(%rdi), %r13d
; SSE2-NEXT: movzbl 13(%rdi), %ebp
; SSE2-NEXT: movzbl 14(%rdi), %eax
; SSE2-NEXT: movzbl 15(%rdi), %edi
; SSE2-NEXT: andl $15, %edi
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -24(%rsp,%rdi), %edi
-; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: andl $15, %ebp
; SSE2-NEXT: movzbl -24(%rsp,%rbp), %eax
-; SSE2-NEXT: movd %eax, %xmm9
-; SSE2-NEXT: andl $15, %esi
-; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax
-; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: andl $15, %ecx
-; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax
-; SSE2-NEXT: movd %eax, %xmm7
-; SSE2-NEXT: andl $15, %r8d
-; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax
-; SSE2-NEXT: movd %eax, %xmm11
-; SSE2-NEXT: andl $15, %ebx
-; SSE2-NEXT: movzbl -24(%rsp,%rbx), %eax
-; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movd %eax, %xmm4
; SSE2-NEXT: andl $15, %r13d
; SSE2-NEXT: movzbl -24(%rsp,%r13), %eax
-; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: andl $15, %r12d
; SSE2-NEXT: movzbl -24(%rsp,%r12), %eax
; SSE2-NEXT: movd %eax, %xmm5
; SSE2-NEXT: andl $15, %r15d
; SSE2-NEXT: movzbl -24(%rsp,%r15), %eax
-; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: movd %eax, %xmm7
; SSE2-NEXT: andl $15, %r14d
; SSE2-NEXT: movzbl -24(%rsp,%r14), %eax
-; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: andl $15, %ebx
+; SSE2-NEXT: movzbl -24(%rsp,%rbx), %eax
+; SSE2-NEXT: movd %eax, %xmm6
; SSE2-NEXT: andl $15, %r11d
; SSE2-NEXT: movzbl -24(%rsp,%r11), %eax
-; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: movd %eax, %xmm9
; SSE2-NEXT: andl $15, %r10d
; SSE2-NEXT: movzbl -24(%rsp,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movd %eax, %xmm10
; SSE2-NEXT: andl $15, %r9d
; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax
-; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movd %eax, %xmm11
+; SSE2-NEXT: andl $15, %r8d
+; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax
+; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: andl $15, %esi
+; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax
+; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: andl $15, %edx
+; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm15
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
@@ -918,85 +918,85 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSSE3-NEXT: movzbl 1(%rdi), %r9d
-; SSSE3-NEXT: movzbl 2(%rdi), %r10d
-; SSSE3-NEXT: movzbl 3(%rdi), %r11d
-; SSSE3-NEXT: movzbl 4(%rdi), %r14d
-; SSSE3-NEXT: movzbl 5(%rdi), %r15d
-; SSSE3-NEXT: movzbl 6(%rdi), %r12d
-; SSSE3-NEXT: movzbl 7(%rdi), %r13d
+; SSSE3-NEXT: movzbl 1(%rdi), %ecx
+; SSSE3-NEXT: movzbl 2(%rdi), %edx
+; SSSE3-NEXT: movzbl 3(%rdi), %esi
+; SSSE3-NEXT: movzbl 4(%rdi), %r8d
+; SSSE3-NEXT: movzbl 5(%rdi), %r9d
+; SSSE3-NEXT: movzbl 6(%rdi), %r10d
+; SSSE3-NEXT: movzbl 7(%rdi), %r11d
; SSSE3-NEXT: movzbl 8(%rdi), %ebx
-; SSSE3-NEXT: movzbl 9(%rdi), %r8d
-; SSSE3-NEXT: movzbl 10(%rdi), %ecx
-; SSSE3-NEXT: movzbl 11(%rdi), %edx
-; SSSE3-NEXT: movzbl 12(%rdi), %esi
+; SSSE3-NEXT: movzbl 9(%rdi), %r14d
+; SSSE3-NEXT: movzbl 10(%rdi), %r15d
+; SSSE3-NEXT: movzbl 11(%rdi), %r12d
+; SSSE3-NEXT: movzbl 12(%rdi), %r13d
; SSSE3-NEXT: movzbl 13(%rdi), %ebp
; SSSE3-NEXT: movzbl 14(%rdi), %eax
; SSSE3-NEXT: movzbl 15(%rdi), %edi
; SSSE3-NEXT: andl $15, %edi
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl -24(%rsp,%rdi), %edi
-; SSSE3-NEXT: movd %edi, %xmm8
+; SSSE3-NEXT: movd %edi, %xmm1
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: andl $15, %ebp
; SSSE3-NEXT: movzbl -24(%rsp,%rbp), %eax
-; SSSE3-NEXT: movd %eax, %xmm9
-; SSSE3-NEXT: andl $15, %esi
-; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax
-; SSSE3-NEXT: movd %eax, %xmm10
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax
-; SSSE3-NEXT: movd %eax, %xmm7
-; SSSE3-NEXT: andl $15, %r8d
-; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax
-; SSSE3-NEXT: movd %eax, %xmm11
-; SSSE3-NEXT: andl $15, %ebx
-; SSSE3-NEXT: movzbl -24(%rsp,%rbx), %eax
-; SSSE3-NEXT: movd %eax, %xmm6
+; SSSE3-NEXT: movd %eax, %xmm4
; SSSE3-NEXT: andl $15, %r13d
; SSSE3-NEXT: movzbl -24(%rsp,%r13), %eax
-; SSSE3-NEXT: movd %eax, %xmm12
+; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: andl $15, %r12d
; SSSE3-NEXT: movzbl -24(%rsp,%r12), %eax
; SSSE3-NEXT: movd %eax, %xmm5
; SSSE3-NEXT: andl $15, %r15d
; SSSE3-NEXT: movzbl -24(%rsp,%r15), %eax
-; SSSE3-NEXT: movd %eax, %xmm13
+; SSSE3-NEXT: movd %eax, %xmm7
; SSSE3-NEXT: andl $15, %r14d
; SSSE3-NEXT: movzbl -24(%rsp,%r14), %eax
-; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movd %eax, %xmm8
+; SSSE3-NEXT: andl $15, %ebx
+; SSSE3-NEXT: movzbl -24(%rsp,%rbx), %eax
+; SSSE3-NEXT: movd %eax, %xmm6
; SSSE3-NEXT: andl $15, %r11d
; SSSE3-NEXT: movzbl -24(%rsp,%r11), %eax
-; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: movd %eax, %xmm9
; SSSE3-NEXT: andl $15, %r10d
; SSSE3-NEXT: movzbl -24(%rsp,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movd %eax, %xmm10
; SSSE3-NEXT: andl $15, %r9d
; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movd %eax, %xmm11
+; SSSE3-NEXT: andl $15, %r8d
+; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax
+; SSSE3-NEXT: movd %eax, %xmm12
+; SSSE3-NEXT: andl $15, %esi
+; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax
+; SSSE3-NEXT: movd %eax, %xmm13
+; SSSE3-NEXT: andl $15, %edx
+; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax
+; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm15
; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
@@ -1014,52 +1014,52 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr
; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movzbl (%rdi), %r9d
-; SSE41-NEXT: andl $15, %r9d
-; SSE41-NEXT: movzbl 1(%rdi), %ebx
-; SSE41-NEXT: movzbl 2(%rdi), %eax
+; SSE41-NEXT: movzbl (%rdi), %ecx
+; SSE41-NEXT: andl $15, %ecx
+; SSE41-NEXT: movzbl 1(%rdi), %eax
; SSE41-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE41-NEXT: movzbl 3(%rdi), %r11d
-; SSE41-NEXT: movzbl 4(%rdi), %r14d
-; SSE41-NEXT: movzbl 5(%rdi), %r15d
-; SSE41-NEXT: movzbl 6(%rdi), %r12d
-; SSE41-NEXT: movzbl 7(%rdi), %r13d
-; SSE41-NEXT: movzbl 8(%rdi), %r10d
-; SSE41-NEXT: movzbl 9(%rdi), %r8d
-; SSE41-NEXT: movzbl 10(%rdi), %ecx
-; SSE41-NEXT: movzbl 11(%rdi), %edx
-; SSE41-NEXT: movzbl 12(%rdi), %esi
+; SSE41-NEXT: movzbl 2(%rdi), %edx
+; SSE41-NEXT: movzbl 3(%rdi), %esi
+; SSE41-NEXT: movzbl 4(%rdi), %r8d
+; SSE41-NEXT: movzbl 5(%rdi), %r9d
+; SSE41-NEXT: movzbl 6(%rdi), %r10d
+; SSE41-NEXT: movzbl 7(%rdi), %r11d
+; SSE41-NEXT: movzbl 8(%rdi), %ebx
+; SSE41-NEXT: movzbl 9(%rdi), %r14d
+; SSE41-NEXT: movzbl 10(%rdi), %r15d
+; SSE41-NEXT: movzbl 11(%rdi), %r12d
+; SSE41-NEXT: movzbl 12(%rdi), %r13d
; SSE41-NEXT: movzbl 13(%rdi), %ebp
; SSE41-NEXT: movzbl 14(%rdi), %eax
; SSE41-NEXT: movzbl 15(%rdi), %edi
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movzbl -24(%rsp,%r9), %r9d
-; SSE41-NEXT: movd %r9d, %xmm0
-; SSE41-NEXT: andl $15, %ebx
-; SSE41-NEXT: pinsrb $1, -24(%rsp,%rbx), %xmm0
-; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE41-NEXT: andl $15, %ebx
-; SSE41-NEXT: pinsrb $2, -24(%rsp,%rbx), %xmm0
+; SSE41-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE41-NEXT: andl $15, %ecx
+; SSE41-NEXT: pinsrb $1, -24(%rsp,%rcx), %xmm0
+; SSE41-NEXT: andl $15, %edx
+; SSE41-NEXT: pinsrb $2, -24(%rsp,%rdx), %xmm0
+; SSE41-NEXT: andl $15, %esi
+; SSE41-NEXT: pinsrb $3, -24(%rsp,%rsi), %xmm0
+; SSE41-NEXT: andl $15, %r8d
+; SSE41-NEXT: pinsrb $4, -24(%rsp,%r8), %xmm0
+; SSE41-NEXT: andl $15, %r9d
+; SSE41-NEXT: pinsrb $5, -24(%rsp,%r9), %xmm0
+; SSE41-NEXT: andl $15, %r10d
+; SSE41-NEXT: pinsrb $6, -24(%rsp,%r10), %xmm0
; SSE41-NEXT: andl $15, %r11d
-; SSE41-NEXT: pinsrb $3, -24(%rsp,%r11), %xmm0
+; SSE41-NEXT: pinsrb $7, -24(%rsp,%r11), %xmm0
+; SSE41-NEXT: andl $15, %ebx
+; SSE41-NEXT: pinsrb $8, -24(%rsp,%rbx), %xmm0
; SSE41-NEXT: andl $15, %r14d
-; SSE41-NEXT: pinsrb $4, -24(%rsp,%r14), %xmm0
+; SSE41-NEXT: pinsrb $9, -24(%rsp,%r14), %xmm0
; SSE41-NEXT: andl $15, %r15d
-; SSE41-NEXT: pinsrb $5, -24(%rsp,%r15), %xmm0
+; SSE41-NEXT: pinsrb $10, -24(%rsp,%r15), %xmm0
; SSE41-NEXT: andl $15, %r12d
-; SSE41-NEXT: pinsrb $6, -24(%rsp,%r12), %xmm0
+; SSE41-NEXT: pinsrb $11, -24(%rsp,%r12), %xmm0
; SSE41-NEXT: andl $15, %r13d
-; SSE41-NEXT: pinsrb $7, -24(%rsp,%r13), %xmm0
-; SSE41-NEXT: andl $15, %r10d
-; SSE41-NEXT: pinsrb $8, -24(%rsp,%r10), %xmm0
-; SSE41-NEXT: andl $15, %r8d
-; SSE41-NEXT: pinsrb $9, -24(%rsp,%r8), %xmm0
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $10, -24(%rsp,%rcx), %xmm0
-; SSE41-NEXT: andl $15, %edx
-; SSE41-NEXT: pinsrb $11, -24(%rsp,%rdx), %xmm0
-; SSE41-NEXT: andl $15, %esi
-; SSE41-NEXT: pinsrb $12, -24(%rsp,%rsi), %xmm0
+; SSE41-NEXT: pinsrb $12, -24(%rsp,%r13), %xmm0
; SSE41-NEXT: andl $15, %ebp
; SSE41-NEXT: pinsrb $13, -24(%rsp,%rbp), %xmm0
; SSE41-NEXT: andl $15, %eax
@@ -1082,52 +1082,52 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr
; AVX-NEXT: pushq %r13
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movzbl (%rdi), %r9d
-; AVX-NEXT: andl $15, %r9d
-; AVX-NEXT: movzbl 1(%rdi), %ebx
-; AVX-NEXT: movzbl 2(%rdi), %eax
+; AVX-NEXT: movzbl (%rdi), %ecx
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: movzbl 1(%rdi), %eax
; AVX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX-NEXT: movzbl 3(%rdi), %r11d
-; AVX-NEXT: movzbl 4(%rdi), %r14d
-; AVX-NEXT: movzbl 5(%rdi), %r15d
-; AVX-NEXT: movzbl 6(%rdi), %r12d
-; AVX-NEXT: movzbl 7(%rdi), %r13d
-; AVX-NEXT: movzbl 8(%rdi), %r10d
-; AVX-NEXT: movzbl 9(%rdi), %r8d
-; AVX-NEXT: movzbl 10(%rdi), %ecx
-; AVX-NEXT: movzbl 11(%rdi), %edx
-; AVX-NEXT: movzbl 12(%rdi), %esi
+; AVX-NEXT: movzbl 2(%rdi), %edx
+; AVX-NEXT: movzbl 3(%rdi), %esi
+; AVX-NEXT: movzbl 4(%rdi), %r8d
+; AVX-NEXT: movzbl 5(%rdi), %r9d
+; AVX-NEXT: movzbl 6(%rdi), %r10d
+; AVX-NEXT: movzbl 7(%rdi), %r11d
+; AVX-NEXT: movzbl 8(%rdi), %ebx
+; AVX-NEXT: movzbl 9(%rdi), %r14d
+; AVX-NEXT: movzbl 10(%rdi), %r15d
+; AVX-NEXT: movzbl 11(%rdi), %r12d
+; AVX-NEXT: movzbl 12(%rdi), %r13d
; AVX-NEXT: movzbl 13(%rdi), %ebp
; AVX-NEXT: movzbl 14(%rdi), %eax
; AVX-NEXT: movzbl 15(%rdi), %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movzbl -24(%rsp,%r9), %r9d
-; AVX-NEXT: vmovd %r9d, %xmm0
-; AVX-NEXT: andl $15, %ebx
-; AVX-NEXT: vpinsrb $1, -24(%rsp,%rbx), %xmm0, %xmm0
-; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX-NEXT: andl $15, %ebx
-; AVX-NEXT: vpinsrb $2, -24(%rsp,%rbx), %xmm0, %xmm0
+; AVX-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: vpinsrb $1, -24(%rsp,%rcx), %xmm0, %xmm0
+; AVX-NEXT: andl $15, %edx
+; AVX-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0
+; AVX-NEXT: andl $15, %esi
+; AVX-NEXT: vpinsrb $3, -24(%rsp,%rsi), %xmm0, %xmm0
+; AVX-NEXT: andl $15, %r8d
+; AVX-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0
+; AVX-NEXT: andl $15, %r9d
+; AVX-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0
+; AVX-NEXT: andl $15, %r10d
+; AVX-NEXT: vpinsrb $6, -24(%rsp,%r10), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r11d
-; AVX-NEXT: vpinsrb $3, -24(%rsp,%r11), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, -24(%rsp,%r11), %xmm0, %xmm0
+; AVX-NEXT: andl $15, %ebx
+; AVX-NEXT: vpinsrb $8, -24(%rsp,%rbx), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r14d
-; AVX-NEXT: vpinsrb $4, -24(%rsp,%r14), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, -24(%rsp,%r14), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r15d
-; AVX-NEXT: vpinsrb $5, -24(%rsp,%r15), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, -24(%rsp,%r15), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r12d
-; AVX-NEXT: vpinsrb $6, -24(%rsp,%r12), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, -24(%rsp,%r12), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r13d
-; AVX-NEXT: vpinsrb $7, -24(%rsp,%r13), %xmm0, %xmm0
-; AVX-NEXT: andl $15, %r10d
-; AVX-NEXT: vpinsrb $8, -24(%rsp,%r10), %xmm0, %xmm0
-; AVX-NEXT: andl $15, %r8d
-; AVX-NEXT: vpinsrb $9, -24(%rsp,%r8), %xmm0, %xmm0
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $10, -24(%rsp,%rcx), %xmm0, %xmm0
-; AVX-NEXT: andl $15, %edx
-; AVX-NEXT: vpinsrb $11, -24(%rsp,%rdx), %xmm0, %xmm0
-; AVX-NEXT: andl $15, %esi
-; AVX-NEXT: vpinsrb $12, -24(%rsp,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, -24(%rsp,%r13), %xmm0, %xmm0
; AVX-NEXT: andl $15, %ebp
; AVX-NEXT: vpinsrb $13, -24(%rsp,%rbp), %xmm0, %xmm0
; AVX-NEXT: andl $15, %eax
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
index 4695a8d9a73cd..f3bafec3399a7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -189,10 +189,10 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0
; ALL-NEXT: # kill: def $edx killed $edx def $rdx
; ALL-NEXT: # kill: def $esi killed $esi def $rsi
; ALL-NEXT: # kill: def $edi killed $edi def $rdi
-; ALL-NEXT: movl 24(%rbp), %r10d
-; ALL-NEXT: andl $7, %r10d
-; ALL-NEXT: movl 16(%rbp), %eax
+; ALL-NEXT: movl 24(%rbp), %eax
; ALL-NEXT: andl $7, %eax
+; ALL-NEXT: movl 16(%rbp), %r10d
+; ALL-NEXT: andl $7, %r10d
; ALL-NEXT: andl $7, %edi
; ALL-NEXT: andl $7, %esi
; ALL-NEXT: andl $7, %edx
@@ -240,10 +240,10 @@ define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0
; ALL-NEXT: # kill: def $edx killed $edx def $rdx
; ALL-NEXT: # kill: def $esi killed $esi def $rsi
; ALL-NEXT: # kill: def $edi killed $edi def $rdi
-; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d
-; ALL-NEXT: andl $3, %r10d
; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax
; ALL-NEXT: andl $3, %eax
+; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; ALL-NEXT: andl $3, %r10d
; ALL-NEXT: andl $3, %edi
; ALL-NEXT: andl $3, %esi
; ALL-NEXT: andl $3, %edx
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index b81695545cfe8..1b772f97469f6 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -2308,7 +2308,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
@@ -2320,18 +2320,18 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255]
+; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
+; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm3
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 588f5377041d8..7e916561826a7 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -321,110 +321,110 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v4i64_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm6, %xmm6
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
-; SSSE3-NEXT: movdqa %xmm3, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm3, %xmm5
; SSSE3-NEXT: por %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i64_v4i32:
@@ -589,265 +589,265 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE2-LABEL: trunc_packus_v8i64_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm8
; SSE2-NEXT: movdqa 32(%rdi), %xmm6
-; SSE2-NEXT: movdqa 48(%rdi), %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa 48(%rdi), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: por %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm6, %xmm8
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm3
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm4
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm8
; SSSE3-NEXT: movdqa 32(%rdi), %xmm6
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm10
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483647,2147483647]
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm6, %xmm4
-; SSSE3-NEXT: movdqa %xmm10, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm6, %xmm8
+; SSSE3-NEXT: pxor %xmm0, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm6
+; SSSE3-NEXT: pandn %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm6, %xmm8
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: movdqa %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm0, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm5
-; SSE41-NEXT: movdqa 16(%rdi), %xmm4
-; SSE41-NEXT: movdqa 32(%rdi), %xmm10
-; SSE41-NEXT: movdqa 48(%rdi), %xmm9
+; SSE41-NEXT: movdqa 16(%rdi), %xmm8
+; SSE41-NEXT: movdqa 32(%rdi), %xmm7
+; SSE41-NEXT: movdqa 48(%rdi), %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm2, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm6, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
+; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: xorpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm6
; SSE41-NEXT: xorpd %xmm3, %xmm6
; SSE41-NEXT: movapd %xmm6, %xmm7
@@ -858,7 +858,7 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
-; SSE41-NEXT: movapd %xmm4, %xmm1
+; SSE41-NEXT: movapd %xmm8, %xmm1
; SSE41-NEXT: xorpd %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm7
; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
@@ -867,35 +867,35 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
-; SSE41-NEXT: movapd %xmm5, %xmm4
-; SSE41-NEXT: xorpd %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movapd %xmm8, %xmm5
-; SSE41-NEXT: xorpd %xmm3, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v8i64_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; SSE41-NEXT: xorpd %xmm3, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
+; SSE41-NEXT: movapd %xmm4, %xmm5
+; SSE41-NEXT: xorpd %xmm3, %xmm5
+; SSE41-NEXT: movapd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc_packus_v8i64_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295]
@@ -1368,46 +1368,46 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v4i64_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
@@ -1428,46 +1428,46 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm0, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm4
@@ -1620,46 +1620,46 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_packus_v4i64_v4i16_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
@@ -1681,46 +1681,46 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i16_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm0, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm4
@@ -1879,115 +1879,115 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" {
; SSE2-LABEL: trunc_packus_v8i64_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa (%rdi), %xmm8
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: movdqa 32(%rdi), %xmm10
+; SSE2-NEXT: movdqa 32(%rdi), %xmm3
; SSE2-NEXT: movdqa 48(%rdi), %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm10, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm5
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm6, %xmm8
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm9, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
@@ -1997,115 +1997,115 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm4
+; SSSE3-NEXT: movdqa (%rdi), %xmm8
; SSSE3-NEXT: movdqa 16(%rdi), %xmm2
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm10
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm3
; SSSE3-NEXT: movdqa 48(%rdi), %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm6, %xmm4
-; SSSE3-NEXT: movdqa %xmm10, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm10, %xmm5
-; SSSE3-NEXT: movdqa %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5
+; SSSE3-NEXT: movdqa %xmm8, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm6, %xmm8
+; SSSE3-NEXT: pxor %xmm0, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm6
+; SSSE3-NEXT: pandn %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm6, %xmm8
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pand %xmm9, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: movdqa %xmm8, %xmm4
+; SSSE3-NEXT: pxor %xmm0, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm2, %xmm6
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
@@ -2115,67 +2115,67 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
;
; SSE41-LABEL: trunc_packus_v8i64_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm10
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
-; SSE41-NEXT: movdqa 32(%rdi), %xmm3
-; SSE41-NEXT: movdqa 48(%rdi), %xmm5
+; SSE41-NEXT: movdqa (%rdi), %xmm7
+; SSE41-NEXT: movdqa 16(%rdi), %xmm5
+; SSE41-NEXT: movdqa 32(%rdi), %xmm4
+; SSE41-NEXT: movdqa 48(%rdi), %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8
-; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183]
+; SSE41-NEXT: movdqa %xmm6, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
-; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: xorpd %xmm2, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT: xorpd %xmm5, %xmm5
+; SSE41-NEXT: movapd %xmm1, %xmm6
+; SSE41-NEXT: xorpd %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movapd %xmm3, %xmm1
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: movapd %xmm8, %xmm1
; SSE41-NEXT: xorpd %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm7
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
@@ -2184,28 +2184,28 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: packusdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm6, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
+; SSE41-NEXT: packusdw %xmm6, %xmm1
+; SSE41-NEXT: movapd %xmm4, %xmm6
+; SSE41-NEXT: xorpd %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE41-NEXT: movapd %xmm8, %xmm4
; SSE41-NEXT: xorpd %xmm2, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: packusdw %xmm3, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: packusdw %xmm6, %xmm5
; SSE41-NEXT: packusdw %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -3019,57 +3019,57 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v4i64_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -3077,35 +3077,35 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm4
@@ -3275,105 +3275,105 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_packus_v4i64_v4i8_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: movd %xmm2, (%rdi)
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm1, %xmm5
+; SSE2-NEXT: packuswb %xmm3, %xmm5
+; SSE2-NEXT: packuswb %xmm5, %xmm5
+; SSE2-NEXT: packuswb %xmm5, %xmm5
+; SSE2-NEXT: movd %xmm5, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i8_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm0, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm4
@@ -3539,110 +3539,110 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE2-LABEL: trunc_packus_v8i64_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm10
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
-; SSE2-NEXT: movdqa 48(%rdi), %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa 48(%rdi), %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm10, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: por %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm8
+; SSE2-NEXT: pxor %xmm1, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm5, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm5, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
@@ -3650,177 +3650,177 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm5
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm10
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
; SSSE3-NEXT: movdqa 32(%rdi), %xmm3
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm10, %xmm1
-; SSSE3-NEXT: pxor %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm5, %xmm8
+; SSSE3-NEXT: pxor %xmm1, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm0, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm11, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: pand %xmm0, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: packuswb %xmm5, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm11, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: packuswb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm11, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm3, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: packuswb %xmm4, %xmm3
+; SSSE3-NEXT: packuswb %xmm5, %xmm3
; SSSE3-NEXT: packuswb %xmm3, %xmm0
; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm10
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
-; SSE41-NEXT: movdqa 32(%rdi), %xmm3
-; SSE41-NEXT: movdqa 48(%rdi), %xmm5
+; SSE41-NEXT: movdqa (%rdi), %xmm7
+; SSE41-NEXT: movdqa 16(%rdi), %xmm5
+; SSE41-NEXT: movdqa 32(%rdi), %xmm4
+; SSE41-NEXT: movdqa 48(%rdi), %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT: xorpd %xmm5, %xmm5
; SSE41-NEXT: movapd %xmm1, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: xorpd %xmm2, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: xorpd %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movapd %xmm3, %xmm1
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: movapd %xmm8, %xmm1
; SSE41-NEXT: xorpd %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm7
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
@@ -3829,28 +3829,28 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: packusdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm6, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
+; SSE41-NEXT: packusdw %xmm6, %xmm1
+; SSE41-NEXT: movapd %xmm4, %xmm6
+; SSE41-NEXT: xorpd %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE41-NEXT: movapd %xmm8, %xmm4
; SSE41-NEXT: xorpd %xmm2, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: packusdw %xmm3, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: packusdw %xmm6, %xmm5
; SSE41-NEXT: packusdw %xmm5, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -3939,110 +3939,110 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE2-LABEL: trunc_packus_v8i64_v8i8_store:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm10
+; SSE2-NEXT: movdqa 16(%rdi), %xmm3
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
-; SSE2-NEXT: movdqa 48(%rdi), %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa 48(%rdi), %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm5, %xmm8
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm6
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm4, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm3
; SSE2-NEXT: movq %xmm3, (%rsi)
@@ -4051,110 +4051,110 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm5
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm10
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm3
; SSSE3-NEXT: movdqa 32(%rdi), %xmm2
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm10, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm8, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm5, %xmm8
+; SSSE3-NEXT: pxor %xmm0, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm3, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: packuswb %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pand %xmm8, %xmm3
+; SSSE3-NEXT: packuswb %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm0, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm11, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: packuswb %xmm4, %xmm2
+; SSSE3-NEXT: packuswb %xmm5, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm3
; SSSE3-NEXT: packuswb %xmm3, %xmm3
; SSSE3-NEXT: movq %xmm3, (%rsi)
@@ -4162,67 +4162,67 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
;
; SSE41-LABEL: trunc_packus_v8i64_v8i8_store:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm10
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
-; SSE41-NEXT: movdqa 32(%rdi), %xmm2
-; SSE41-NEXT: movdqa 48(%rdi), %xmm5
+; SSE41-NEXT: movdqa (%rdi), %xmm7
+; SSE41-NEXT: movdqa 16(%rdi), %xmm5
+; SSE41-NEXT: movdqa 32(%rdi), %xmm3
+; SSE41-NEXT: movdqa 48(%rdi), %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm3, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT: xorpd %xmm5, %xmm5
; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: movapd %xmm4, %xmm3
-; SSE41-NEXT: xorpd %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm7
+; SSE41-NEXT: xorpd %xmm1, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm1, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movapd %xmm2, %xmm4
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
+; SSE41-NEXT: movapd %xmm8, %xmm4
; SSE41-NEXT: xorpd %xmm1, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm7
; SSE41-NEXT: pcmpeqd %xmm1, %xmm7
@@ -4231,28 +4231,28 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: packusdw %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm6, %xmm2
-; SSE41-NEXT: xorpd %xmm1, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT: packusdw %xmm6, %xmm4
+; SSE41-NEXT: movapd %xmm3, %xmm6
+; SSE41-NEXT: xorpd %xmm1, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
-; SSE41-NEXT: movapd %xmm8, %xmm3
; SSE41-NEXT: xorpd %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE41-NEXT: movapd %xmm3, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm7
; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: packusdw %xmm2, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT: packusdw %xmm6, %xmm5
; SSE41-NEXT: packusdw %xmm5, %xmm4
; SSE41-NEXT: packuswb %xmm4, %xmm4
; SSE41-NEXT: movq %xmm4, (%rsi)
@@ -4344,618 +4344,618 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE2-LABEL: trunc_packus_v16i64_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm11
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
-; SSE2-NEXT: movdqa 32(%rdi), %xmm15
-; SSE2-NEXT: movdqa 48(%rdi), %xmm12
-; SSE2-NEXT: movdqa 80(%rdi), %xmm2
+; SSE2-NEXT: movdqa (%rdi), %xmm7
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa 32(%rdi), %xmm12
+; SSE2-NEXT: movdqa 48(%rdi), %xmm11
+; SSE2-NEXT: movdqa 80(%rdi), %xmm10
; SSE2-NEXT: movdqa 64(%rdi), %xmm5
-; SSE2-NEXT: movdqa 112(%rdi), %xmm3
-; SSE2-NEXT: movdqa 96(%rdi), %xmm14
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT: movdqa 112(%rdi), %xmm4
+; SSE2-NEXT: movdqa 96(%rdi), %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: pxor %xmm1, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm13
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm8, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm13
+; SSE2-NEXT: movdqa %xmm8, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm13
-; SSE2-NEXT: pand %xmm13, %xmm14
-; SSE2-NEXT: pandn %xmm8, %xmm13
-; SSE2-NEXT: por %xmm14, %xmm13
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm14
-; SSE2-NEXT: pand %xmm14, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm14
-; SSE2-NEXT: por %xmm3, %xmm14
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm13
+; SSE2-NEXT: movdqa %xmm8, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm15, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm15
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm15, %xmm7
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm15, %xmm12
-; SSE2-NEXT: pandn %xmm8, %xmm15
-; SSE2-NEXT: por %xmm12, %xmm15
-; SSE2-NEXT: movdqa %xmm11, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm12
+; SSE2-NEXT: movdqa %xmm10, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm13
+; SSE2-NEXT: movdqa %xmm8, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm10
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm12, %xmm10
+; SSE2-NEXT: pxor %xmm1, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm13
+; SSE2-NEXT: movdqa %xmm8, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm12
+; SSE2-NEXT: pandn %xmm6, %xmm10
+; SSE2-NEXT: por %xmm12, %xmm10
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm13
+; SSE2-NEXT: movdqa %xmm8, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm12, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm12
; SSE2-NEXT: pand %xmm12, %xmm11
-; SSE2-NEXT: pandn %xmm8, %xmm12
+; SSE2-NEXT: pandn %xmm6, %xmm12
; SSE2-NEXT: por %xmm11, %xmm12
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm9
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm9, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm7, %xmm11
+; SSE2-NEXT: pxor %xmm1, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm13
+; SSE2-NEXT: movdqa %xmm8, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm11
+; SSE2-NEXT: por %xmm7, %xmm11
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: pxor %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm13
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm8
+; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: movdqa %xmm11, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm12, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm0
+; SSE2-NEXT: pand %xmm11, %xmm0
; SSE2-NEXT: packuswb %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm15, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm15, %xmm3
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm12, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: movdqa %xmm10, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: packuswb %xmm7, %xmm8
+; SSE2-NEXT: packuswb %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: packuswb %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: packuswb %xmm3, %xmm6
-; SSE2-NEXT: packuswb %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm14, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm14, %xmm4
-; SSE2-NEXT: movdqa %xmm13, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm13, %xmm3
-; SSE2-NEXT: packuswb %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm5
+; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v16i64_v16i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm11
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm9
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm15
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm12
-; SSSE3-NEXT: movdqa 80(%rdi), %xmm2
+; SSSE3-NEXT: movdqa (%rdi), %xmm7
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm12
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm11
+; SSSE3-NEXT: movdqa 80(%rdi), %xmm10
; SSSE3-NEXT: movdqa 64(%rdi), %xmm5
-; SSSE3-NEXT: movdqa 112(%rdi), %xmm3
-; SSSE3-NEXT: movdqa 96(%rdi), %xmm14
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT: movdqa 112(%rdi), %xmm4
+; SSSE3-NEXT: movdqa 96(%rdi), %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm14, %xmm7
-; SSSE3-NEXT: pxor %xmm1, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm10, %xmm10
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm0, %xmm7
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm9, %xmm9
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm8, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm6, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13
+; SSSE3-NEXT: movdqa %xmm8, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm4
+; SSSE3-NEXT: pandn %xmm6, %xmm3
+; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: pxor %xmm1, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm13
-; SSSE3-NEXT: pand %xmm13, %xmm14
-; SSSE3-NEXT: pandn %xmm8, %xmm13
-; SSSE3-NEXT: por %xmm14, %xmm13
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm14
-; SSSE3-NEXT: pand %xmm14, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm14
-; SSSE3-NEXT: por %xmm3, %xmm14
-; SSSE3-NEXT: movdqa %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13
+; SSSE3-NEXT: movdqa %xmm8, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm4
+; SSSE3-NEXT: pandn %xmm6, %xmm4
; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm15, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm6, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm15
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm15, %xmm7
-; SSSE3-NEXT: movdqa %xmm12, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm6, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm15
-; SSSE3-NEXT: pand %xmm15, %xmm12
-; SSSE3-NEXT: pandn %xmm8, %xmm15
-; SSSE3-NEXT: por %xmm12, %xmm15
-; SSSE3-NEXT: movdqa %xmm11, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm6, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm12
+; SSSE3-NEXT: movdqa %xmm10, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13
+; SSSE3-NEXT: movdqa %xmm8, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm10
+; SSSE3-NEXT: pandn %xmm6, %xmm5
+; SSSE3-NEXT: por %xmm10, %xmm5
+; SSSE3-NEXT: movdqa %xmm12, %xmm10
+; SSSE3-NEXT: pxor %xmm1, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13
+; SSSE3-NEXT: movdqa %xmm8, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm10
+; SSSE3-NEXT: pand %xmm10, %xmm12
+; SSSE3-NEXT: pandn %xmm6, %xmm10
+; SSSE3-NEXT: por %xmm12, %xmm10
+; SSSE3-NEXT: movdqa %xmm11, %xmm12
+; SSSE3-NEXT: pxor %xmm1, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13
+; SSSE3-NEXT: movdqa %xmm8, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm12, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm12
; SSSE3-NEXT: pand %xmm12, %xmm11
-; SSSE3-NEXT: pandn %xmm8, %xmm12
+; SSSE3-NEXT: pandn %xmm6, %xmm12
; SSSE3-NEXT: por %xmm11, %xmm12
-; SSSE3-NEXT: movdqa %xmm9, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm9
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm9, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm7, %xmm11
+; SSSE3-NEXT: pxor %xmm1, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13
+; SSSE3-NEXT: movdqa %xmm8, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm11, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm11
+; SSSE3-NEXT: pand %xmm11, %xmm7
+; SSSE3-NEXT: pandn %xmm6, %xmm11
+; SSSE3-NEXT: por %xmm7, %xmm11
+; SSSE3-NEXT: movdqa %xmm0, %xmm7
+; SSSE3-NEXT: pxor %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm6, %xmm8
+; SSSE3-NEXT: por %xmm0, %xmm8
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pand %xmm7, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: movdqa %xmm12, %xmm0
+; SSSE3-NEXT: pand %xmm8, %xmm6
+; SSSE3-NEXT: movdqa %xmm11, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm0, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: pand %xmm12, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm0
+; SSSE3-NEXT: pand %xmm11, %xmm0
; SSSE3-NEXT: packuswb %xmm6, %xmm0
-; SSSE3-NEXT: movdqa %xmm15, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm15, %xmm3
-; SSSE3-NEXT: movdqa %xmm7, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: movdqa %xmm12, %xmm6
+; SSSE3-NEXT: pxor %xmm1, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm12, %xmm7
+; SSSE3-NEXT: movdqa %xmm10, %xmm6
+; SSSE3-NEXT: pxor %xmm1, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm8
+; SSSE3-NEXT: pand %xmm10, %xmm8
+; SSSE3-NEXT: packuswb %xmm7, %xmm8
+; SSSE3-NEXT: packuswb %xmm8, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm1, %xmm6
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm5, %xmm7
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm8, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm5
+; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: packuswb %xmm7, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm6
-; SSSE3-NEXT: pand %xmm7, %xmm6
-; SSSE3-NEXT: packuswb %xmm3, %xmm6
-; SSSE3-NEXT: packuswb %xmm6, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm4, %xmm6
+; SSSE3-NEXT: pand %xmm3, %xmm6
; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm14, %xmm3
; SSSE3-NEXT: pxor %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm14, %xmm4
-; SSSE3-NEXT: movdqa %xmm13, %xmm3
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm13, %xmm3
-; SSSE3-NEXT: packuswb %xmm4, %xmm3
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: packuswb %xmm6, %xmm3
+; SSSE3-NEXT: packuswb %xmm3, %xmm5
+; SSSE3-NEXT: packuswb %xmm5, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v16i64_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm10
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
-; SSE41-NEXT: movdqa 32(%rdi), %xmm14
-; SSE41-NEXT: movdqa 48(%rdi), %xmm12
-; SSE41-NEXT: movdqa 80(%rdi), %xmm15
+; SSE41-NEXT: movdqa (%rdi), %xmm8
+; SSE41-NEXT: movdqa 16(%rdi), %xmm7
+; SSE41-NEXT: movdqa 32(%rdi), %xmm12
+; SSE41-NEXT: movdqa 48(%rdi), %xmm11
+; SSE41-NEXT: movdqa 80(%rdi), %xmm10
; SSE41-NEXT: movdqa 64(%rdi), %xmm6
-; SSE41-NEXT: movdqa 112(%rdi), %xmm13
+; SSE41-NEXT: movdqa 112(%rdi), %xmm5
; SSE41-NEXT: movdqa 96(%rdi), %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
-; SSE41-NEXT: movdqa %xmm13, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm11
-; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm11
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm13
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm13
-; SSE41-NEXT: movdqa %xmm15, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm6
-; SSE41-NEXT: movdqa %xmm14, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm9, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm15
-; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm15
-; SSE41-NEXT: movdqa %xmm12, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm5
+; SSE41-NEXT: movdqa %xmm9, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
+; SSE41-NEXT: movdqa %xmm10, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm9
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm9, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6
+; SSE41-NEXT: movdqa %xmm12, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm10
+; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10
+; SSE41-NEXT: movdqa %xmm11, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm12
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm12
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm11
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
+; SSE41-NEXT: xorpd %xmm7, %xmm7
+; SSE41-NEXT: movapd %xmm1, %xmm8
+; SSE41-NEXT: xorpd %xmm2, %xmm8
+; SSE41-NEXT: movapd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE41-NEXT: movapd %xmm11, %xmm1
; SSE41-NEXT: xorpd %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE41-NEXT: movapd %xmm1, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
; SSE41-NEXT: pcmpgtd %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: packusdw %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm4, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movapd %xmm15, %xmm4
-; SSE41-NEXT: xorpd %xmm2, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1
+; SSE41-NEXT: packusdw %xmm8, %xmm1
+; SSE41-NEXT: movapd %xmm12, %xmm8
+; SSE41-NEXT: xorpd %xmm2, %xmm8
+; SSE41-NEXT: movapd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8
+; SSE41-NEXT: movapd %xmm10, %xmm9
+; SSE41-NEXT: xorpd %xmm2, %xmm9
+; SSE41-NEXT: movapd %xmm9, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm9
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm9
+; SSE41-NEXT: packusdw %xmm8, %xmm9
+; SSE41-NEXT: packusdw %xmm9, %xmm1
+; SSE41-NEXT: movapd %xmm6, %xmm8
+; SSE41-NEXT: xorpd %xmm2, %xmm8
+; SSE41-NEXT: movapd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8
+; SSE41-NEXT: movapd %xmm5, %xmm6
+; SSE41-NEXT: xorpd %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
+; SSE41-NEXT: packusdw %xmm8, %xmm6
; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4
-; SSE41-NEXT: packusdw %xmm3, %xmm4
-; SSE41-NEXT: packusdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm6, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
+; SSE41-NEXT: xorpd %xmm2, %xmm5
+; SSE41-NEXT: movapd %xmm5, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4
-; SSE41-NEXT: movapd %xmm13, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm3
-; SSE41-NEXT: packusdw %xmm4, %xmm3
-; SSE41-NEXT: movapd %xmm11, %xmm4
; SSE41-NEXT: xorpd %xmm2, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE41-NEXT: movapd %xmm4, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm8
; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4
-; SSE41-NEXT: movapd %xmm8, %xmm5
-; SSE41-NEXT: xorpd %xmm2, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9
-; SSE41-NEXT: packusdw %xmm4, %xmm9
-; SSE41-NEXT: packusdw %xmm9, %xmm3
-; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT: packusdw %xmm5, %xmm7
+; SSE41-NEXT: packusdw %xmm7, %xmm6
+; SSE41-NEXT: packuswb %xmm6, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -4964,10 +4964,10 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm8
+; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3
@@ -4977,39 +4977,39 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; AVX1-NEXT: vmovdqa (%rdi), %xmm5
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm7, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7
-; AVX1-NEXT: vpand %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm7
-; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm5
-; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm5
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9
; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9
+; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5
+; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5
+; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4
+; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4
+; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i64_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 63cbe5e517d64..154d797584722 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -309,122 +309,122 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; SSE2-LABEL: trunc_ssat_v4i64_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm8, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v4i64_v4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm6, %xmm6
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa %xmm3, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm3, %xmm5
; SSSE3-NEXT: por %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320]
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm8, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm5
+; SSSE3-NEXT: pandn %xmm1, %xmm3
+; SSSE3-NEXT: por %xmm5, %xmm3
; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v4i64_v4i32:
@@ -580,325 +580,325 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
; SSE2-NEXT: movdqa 16(%rdi), %xmm5
-; SSE2-NEXT: movdqa 32(%rdi), %xmm11
-; SSE2-NEXT: movdqa 48(%rdi), %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSE2-NEXT: movdqa 32(%rdi), %xmm7
+; SSE2-NEXT: movdqa 48(%rdi), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm11
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSE2-NEXT: por %xmm11, %xmm5
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm10, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm5, %xmm7
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: pxor %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320]
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm9
+; SSE2-NEXT: por %xmm8, %xmm9
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: por %xmm10, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v8i64_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm3
; SSSE3-NEXT: movdqa 16(%rdi), %xmm5
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm11
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm10
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm7
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: pxor %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm11
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm7, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSSE3-NEXT: por %xmm11, %xmm5
-; SSSE3-NEXT: movdqa %xmm10, %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm10, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320]
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm5, %xmm7
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm7, %xmm5
+; SSSE3-NEXT: movdqa %xmm1, %xmm7
+; SSSE3-NEXT: pxor %xmm0, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm1, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
+; SSSE3-NEXT: movdqa %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320]
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm9
+; SSSE3-NEXT: pand %xmm9, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm9
+; SSSE3-NEXT: por %xmm8, %xmm9
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm10
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm1
+; SSSE3-NEXT: por %xmm10, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm1
; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2]
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm3, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v8i64_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm5
-; SSE41-NEXT: movdqa 16(%rdi), %xmm4
-; SSE41-NEXT: movdqa 32(%rdi), %xmm10
-; SSE41-NEXT: movdqa 48(%rdi), %xmm9
+; SSE41-NEXT: movdqa 16(%rdi), %xmm8
+; SSE41-NEXT: movdqa 32(%rdi), %xmm7
+; SSE41-NEXT: movdqa 48(%rdi), %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [2147483647,2147483647]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa %xmm2, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm6, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm11
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
+; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
; SSE41-NEXT: movapd %xmm1, %xmm7
; SSE41-NEXT: xorpd %xmm3, %xmm7
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320]
-; SSE41-NEXT: movapd %xmm7, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movapd %xmm4, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT: movapd %xmm8, %xmm1
; SSE41-NEXT: xorpd %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movapd %xmm1, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
; SSE41-NEXT: pcmpgtd %xmm6, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; SSE41-NEXT: movapd %xmm11, %xmm4
-; SSE41-NEXT: xorpd %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4
-; SSE41-NEXT: xorpd %xmm8, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
+; SSE41-NEXT: movapd %xmm5, %xmm7
+; SSE41-NEXT: xorpd %xmm3, %xmm7
+; SSE41-NEXT: movapd %xmm7, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
+; SSE41-NEXT: xorpd %xmm4, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm5
; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -1360,123 +1360,123 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
; SSE2-LABEL: trunc_ssat_v4i64_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm8, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: packssdw %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
; SSE2-NEXT: packssdw %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v4i64_v4i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSSE3-NEXT: pxor %xmm6, %xmm6
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415]
-; SSSE3-NEXT: movdqa %xmm3, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm3, %xmm5
; SSSE3-NEXT: por %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm8, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm5
+; SSSE3-NEXT: pandn %xmm1, %xmm3
+; SSSE3-NEXT: por %xmm5, %xmm3
; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: packssdw %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: packssdw %xmm3, %xmm0
; SSSE3-NEXT: packssdw %xmm0, %xmm0
; SSSE3-NEXT: retq
;
@@ -1604,126 +1604,126 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_ssat_v4i64_v4i16_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709518848,18446744073709518848]
+; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: por %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: packssdw %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: packssdw %xmm1, %xmm2
+; SSE2-NEXT: packssdw %xmm2, %xmm2
+; SSE2-NEXT: movq %xmm2, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v4i64_v4i16_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415]
-; SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709518848,18446744073709518848]
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm8, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm5
+; SSSE3-NEXT: pandn %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: packssdw %xmm0, %xmm0
-; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: packssdw %xmm1, %xmm2
+; SSSE3-NEXT: packssdw %xmm2, %xmm2
+; SSSE3-NEXT: movq %xmm2, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v4i64_v4i16_store:
@@ -1855,329 +1855,329 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" {
; SSE2-LABEL: trunc_ssat_v8i64_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm11
-; SSE2-NEXT: movdqa 16(%rdi), %xmm10
+; SSE2-NEXT: movdqa (%rdi), %xmm6
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
; SSE2-NEXT: movdqa 48(%rdi), %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147516415,2147516415]
-; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415]
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm11
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSE2-NEXT: por %xmm11, %xmm5
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848]
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm9
+; SSE2-NEXT: por %xmm8, %xmm9
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm10, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: packssdw %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: packssdw %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: packssdw %xmm4, %xmm1
+; SSE2-NEXT: packssdw %xmm5, %xmm1
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v8i64_v8i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm11
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm10
+; SSSE3-NEXT: movdqa (%rdi), %xmm6
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
; SSSE3-NEXT: movdqa 32(%rdi), %xmm3
; SSSE3-NEXT: movdqa 48(%rdi), %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147516415,2147516415]
-; SSSE3-NEXT: movdqa %xmm0, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415]
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm11
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSSE3-NEXT: por %xmm11, %xmm5
-; SSSE3-NEXT: movdqa %xmm10, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm10, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562035200,18446744071562035200]
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pand %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm6
+; SSSE3-NEXT: pxor %xmm1, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm0, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848]
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm9
+; SSSE3-NEXT: pand %xmm9, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm9
+; SSSE3-NEXT: por %xmm8, %xmm9
; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm10
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
+; SSSE3-NEXT: por %xmm10, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: packssdw %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: packssdw %xmm9, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pand %xmm3, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: packssdw %xmm4, %xmm1
+; SSSE3-NEXT: packssdw %xmm5, %xmm1
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v8i64_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm10
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
-; SSE41-NEXT: movdqa 32(%rdi), %xmm3
-; SSE41-NEXT: movdqa 48(%rdi), %xmm5
+; SSE41-NEXT: movdqa (%rdi), %xmm7
+; SSE41-NEXT: movdqa 16(%rdi), %xmm5
+; SSE41-NEXT: movdqa 32(%rdi), %xmm4
+; SSE41-NEXT: movdqa 48(%rdi), %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [32767,32767]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415]
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm11
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm11
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709518848,18446744073709518848]
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: xorpd %xmm2, %xmm4
+; SSE41-NEXT: movapd %xmm1, %xmm7
+; SSE41-NEXT: xorpd %xmm2, %xmm7
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200]
-; SSE41-NEXT: movapd %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm5, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movapd %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT: movapd %xmm8, %xmm1
; SSE41-NEXT: xorpd %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movapd %xmm1, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
; SSE41-NEXT: pcmpgtd %xmm6, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: packssdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm11, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm5, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3
-; SSE41-NEXT: xorpd %xmm8, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
+; SSE41-NEXT: packssdw %xmm7, %xmm1
+; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: xorpd %xmm2, %xmm7
+; SSE41-NEXT: movapd %xmm7, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
+; SSE41-NEXT: xorpd %xmm3, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: packssdw %xmm3, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: packssdw %xmm7, %xmm5
; SSE41-NEXT: packssdw %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -2739,36 +2739,36 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; SSE2-LABEL: trunc_ssat_v4i64_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
@@ -2776,64 +2776,64 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v4i64_v4i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
@@ -2841,28 +2841,28 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: por %xmm8, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
+; SSSE3-NEXT: pandn %xmm1, %xmm4
; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v4i64_v4i8:
@@ -2996,65 +2996,65 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_ssat_v4i64_v4i8_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, (%rdi)
@@ -3062,65 +3062,65 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
;
; SSSE3-LABEL: trunc_ssat_v4i64_v4i8_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm0, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm8, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm5
+; SSSE3-NEXT: pandn %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm0, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: movd %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
@@ -3260,331 +3260,331 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE2-LABEL: trunc_ssat_v8i64_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm11
-; SSE2-NEXT: movdqa 16(%rdi), %xmm10
+; SSE2-NEXT: movdqa (%rdi), %xmm6
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
; SSE2-NEXT: movdqa 48(%rdi), %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm11
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSE2-NEXT: por %xmm11, %xmm5
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm9
+; SSE2-NEXT: por %xmm8, %xmm9
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm10, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: packssdw %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: packssdw %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: packssdw %xmm4, %xmm1
+; SSE2-NEXT: packssdw %xmm5, %xmm1
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v8i64_v8i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm11
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm10
+; SSSE3-NEXT: movdqa (%rdi), %xmm6
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
; SSSE3-NEXT: movdqa 32(%rdi), %xmm3
; SSSE3-NEXT: movdqa 48(%rdi), %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483775,2147483775]
-; SSSE3-NEXT: movdqa %xmm0, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm3
; SSSE3-NEXT: por %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm11
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSSE3-NEXT: por %xmm11, %xmm5
-; SSSE3-NEXT: movdqa %xmm10, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm10, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pand %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm6
+; SSSE3-NEXT: pxor %xmm1, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm0, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm9
+; SSSE3-NEXT: pand %xmm9, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm9
+; SSSE3-NEXT: por %xmm8, %xmm9
; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm10
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
+; SSSE3-NEXT: por %xmm10, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: packssdw %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: packssdw %xmm9, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pand %xmm3, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: packssdw %xmm4, %xmm1
+; SSSE3-NEXT: packssdw %xmm5, %xmm1
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: packsswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v8i64_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm10
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
-; SSE41-NEXT: movdqa 32(%rdi), %xmm3
-; SSE41-NEXT: movdqa 48(%rdi), %xmm5
+; SSE41-NEXT: movdqa (%rdi), %xmm7
+; SSE41-NEXT: movdqa 16(%rdi), %xmm5
+; SSE41-NEXT: movdqa 32(%rdi), %xmm4
+; SSE41-NEXT: movdqa 48(%rdi), %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm11
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm11
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: xorpd %xmm2, %xmm4
+; SSE41-NEXT: movapd %xmm1, %xmm7
+; SSE41-NEXT: xorpd %xmm2, %xmm7
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: movapd %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm5, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movapd %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT: movapd %xmm8, %xmm1
; SSE41-NEXT: xorpd %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movapd %xmm1, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
; SSE41-NEXT: pcmpgtd %xmm6, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: packssdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm11, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm5, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3
-; SSE41-NEXT: xorpd %xmm8, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
+; SSE41-NEXT: packssdw %xmm7, %xmm1
+; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: xorpd %xmm2, %xmm7
+; SSE41-NEXT: movapd %xmm7, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
+; SSE41-NEXT: xorpd %xmm3, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: packssdw %xmm3, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: packssdw %xmm7, %xmm5
; SSE41-NEXT: packssdw %xmm5, %xmm1
; SSE41-NEXT: packsswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -3672,117 +3672,117 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-width"="256" {
; SSE2-LABEL: trunc_ssat_v8i64_v8i8_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm11
-; SSE2-NEXT: movdqa 16(%rdi), %xmm10
+; SSE2-NEXT: movdqa (%rdi), %xmm6
+; SSE2-NEXT: movdqa 16(%rdi), %xmm3
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm12
-; SSE2-NEXT: pand %xmm12, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm12
-; SSE2-NEXT: por %xmm2, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm11
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSE2-NEXT: por %xmm11, %xmm5
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm7
+; SSE2-NEXT: por %xmm3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pandn %xmm3, %xmm8
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pxor %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm10, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm7
; SSE2-NEXT: pand %xmm7, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: pandn %xmm3, %xmm7
; SSE2-NEXT: por %xmm5, %xmm7
-; SSE2-NEXT: packssdw %xmm3, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm12, %xmm0
+; SSE2-NEXT: packssdw %xmm8, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm12, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: packssdw %xmm5, %xmm0
; SSE2-NEXT: packssdw %xmm0, %xmm7
; SSE2-NEXT: packsswb %xmm7, %xmm7
; SSE2-NEXT: movq %xmm7, (%rsi)
@@ -3790,117 +3790,117 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
;
; SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm11
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm10
+; SSSE3-NEXT: movdqa (%rdi), %xmm6
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm3
; SSSE3-NEXT: movdqa 32(%rdi), %xmm2
; SSSE3-NEXT: movdqa 48(%rdi), %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127]
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775]
-; SSSE3-NEXT: movdqa %xmm3, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm12
-; SSSE3-NEXT: pand %xmm12, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm12
-; SSSE3-NEXT: por %xmm2, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm2
; SSSE3-NEXT: por %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm11
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm7, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3]
; SSSE3-NEXT: por %xmm11, %xmm5
-; SSSE3-NEXT: movdqa %xmm10, %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm7
+; SSSE3-NEXT: pand %xmm5, %xmm6
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm3
+; SSSE3-NEXT: pandn %xmm4, %xmm7
+; SSSE3-NEXT: por %xmm3, %xmm7
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pxor %xmm0, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pandn %xmm3, %xmm8
+; SSSE3-NEXT: por %xmm7, %xmm8
+; SSSE3-NEXT: movdqa %xmm5, %xmm7
+; SSSE3-NEXT: pxor %xmm0, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm10, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm7, %xmm3
-; SSSE3-NEXT: pxor %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm7
; SSSE3-NEXT: pand %xmm7, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm7
+; SSSE3-NEXT: pandn %xmm3, %xmm7
; SSSE3-NEXT: por %xmm5, %xmm7
-; SSSE3-NEXT: packssdw %xmm3, %xmm7
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm12, %xmm0
+; SSSE3-NEXT: packssdw %xmm8, %xmm7
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm12
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm12, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: packssdw %xmm5, %xmm0
; SSSE3-NEXT: packssdw %xmm0, %xmm7
; SSSE3-NEXT: packsswb %xmm7, %xmm7
; SSSE3-NEXT: movq %xmm7, (%rsi)
@@ -3908,97 +3908,97 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
;
; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm10
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
-; SSE41-NEXT: movdqa 32(%rdi), %xmm2
-; SSE41-NEXT: movdqa 48(%rdi), %xmm5
+; SSE41-NEXT: movdqa (%rdi), %xmm7
+; SSE41-NEXT: movdqa 16(%rdi), %xmm5
+; SSE41-NEXT: movdqa 32(%rdi), %xmm3
+; SSE41-NEXT: movdqa 48(%rdi), %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127]
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm3, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm11
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm11
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm4, %xmm3
-; SSE41-NEXT: xorpd %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: xorpd %xmm1, %xmm7
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: movapd %xmm3, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm5, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movapd %xmm2, %xmm4
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
+; SSE41-NEXT: movapd %xmm8, %xmm4
; SSE41-NEXT: xorpd %xmm1, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movapd %xmm4, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: packssdw %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm11, %xmm2
-; SSE41-NEXT: xorpd %xmm1, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm5, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2
-; SSE41-NEXT: xorpd %xmm8, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
+; SSE41-NEXT: packssdw %xmm7, %xmm4
+; SSE41-NEXT: movapd %xmm3, %xmm7
+; SSE41-NEXT: xorpd %xmm1, %xmm7
+; SSE41-NEXT: movapd %xmm7, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT: xorpd %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm3
; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
; SSE41-NEXT: pcmpgtd %xmm6, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: packssdw %xmm2, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT: packssdw %xmm7, %xmm5
; SSE41-NEXT: packssdw %xmm5, %xmm4
; SSE41-NEXT: packsswb %xmm4, %xmm4
; SSE41-NEXT: movq %xmm4, (%rsi)
@@ -4088,638 +4088,638 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE2-LABEL: trunc_ssat_v16i64_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm11
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
-; SSE2-NEXT: movdqa 32(%rdi), %xmm14
-; SSE2-NEXT: movdqa 48(%rdi), %xmm12
+; SSE2-NEXT: movdqa (%rdi), %xmm8
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa 32(%rdi), %xmm12
+; SSE2-NEXT: movdqa 48(%rdi), %xmm11
; SSE2-NEXT: movdqa 80(%rdi), %xmm7
-; SSE2-NEXT: movdqa 64(%rdi), %xmm2
+; SSE2-NEXT: movdqa 64(%rdi), %xmm5
; SSE2-NEXT: movdqa 112(%rdi), %xmm4
; SSE2-NEXT: movdqa 96(%rdi), %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
-; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm15, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [127,127]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm13
-; SSE2-NEXT: pand %xmm13, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm13
-; SSE2-NEXT: por %xmm3, %xmm13
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm13
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
+; SSE2-NEXT: movdqa %xmm9, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm6, %xmm5
; SSE2-NEXT: por %xmm7, %xmm5
-; SSE2-NEXT: movdqa %xmm14, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm14
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm14, %xmm7
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm14
-; SSE2-NEXT: pand %xmm14, %xmm12
-; SSE2-NEXT: pandn %xmm8, %xmm14
-; SSE2-NEXT: por %xmm12, %xmm14
-; SSE2-NEXT: movdqa %xmm11, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm12
+; SSE2-NEXT: movdqa %xmm12, %xmm7
+; SSE2-NEXT: pxor %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm12
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: por %xmm12, %xmm7
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm12, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm12
; SSE2-NEXT: pand %xmm12, %xmm11
-; SSE2-NEXT: pandn %xmm8, %xmm12
+; SSE2-NEXT: pandn %xmm6, %xmm12
; SSE2-NEXT: por %xmm11, %xmm12
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm9
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm9, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm9
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm11
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: pxor %xmm15, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm8, %xmm11
+; SSE2-NEXT: pxor %xmm1, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm15
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3]
+; SSE2-NEXT: por %xmm15, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm8
+; SSE2-NEXT: pandn %xmm6, %xmm11
+; SSE2-NEXT: por %xmm8, %xmm11
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: pxor %xmm1, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm13
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm8, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm10
+; SSE2-NEXT: por %xmm0, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: movdqa %xmm10, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm13
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm13, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm14, %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm10
+; SSE2-NEXT: pandn %xmm6, %xmm13
+; SSE2-NEXT: por %xmm10, %xmm13
+; SSE2-NEXT: movdqa %xmm11, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm14
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm12, %xmm0
-; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm14, %xmm1
-; SSE2-NEXT: pxor %xmm15, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm14
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm14, %xmm1
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm15, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: packssdw %xmm1, %xmm2
-; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm15, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm15, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: packssdw %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm15, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: por %xmm14, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm11
+; SSE2-NEXT: pandn %xmm6, %xmm0
+; SSE2-NEXT: por %xmm11, %xmm0
+; SSE2-NEXT: packssdw %xmm13, %xmm0
+; SSE2-NEXT: movdqa %xmm12, %xmm10
+; SSE2-NEXT: pxor %xmm1, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm12
+; SSE2-NEXT: pandn %xmm6, %xmm10
+; SSE2-NEXT: por %xmm12, %xmm10
+; SSE2-NEXT: movdqa %xmm7, %xmm11
+; SSE2-NEXT: pxor %xmm1, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm13, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm11
+; SSE2-NEXT: por %xmm7, %xmm11
+; SSE2-NEXT: packssdw %xmm10, %xmm11
+; SSE2-NEXT: packssdw %xmm11, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pxor %xmm1, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: packssdw %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm13, %xmm15
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm15
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm13
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm13, %xmm3
-; SSE2-NEXT: packssdw %xmm1, %xmm3
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: packsswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v16i64_v16i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm11
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm9
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm14
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm12
-; SSSE3-NEXT: movdqa 80(%rdi), %xmm7
-; SSSE3-NEXT: movdqa 64(%rdi), %xmm2
-; SSSE3-NEXT: movdqa 112(%rdi), %xmm4
-; SSSE3-NEXT: movdqa 96(%rdi), %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm15 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm15, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pxor %xmm10, %xmm10
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483775,2147483775]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm13
-; SSSE3-NEXT: pand %xmm13, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm13
-; SSSE3-NEXT: por %xmm3, %xmm13
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm7, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm7, %xmm5
-; SSSE3-NEXT: movdqa %xmm14, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm14
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm14, %xmm7
-; SSSE3-NEXT: movdqa %xmm12, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm14
-; SSSE3-NEXT: pand %xmm14, %xmm12
-; SSSE3-NEXT: pandn %xmm8, %xmm14
-; SSSE3-NEXT: por %xmm12, %xmm14
-; SSSE3-NEXT: movdqa %xmm11, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm12
-; SSSE3-NEXT: pand %xmm12, %xmm11
-; SSSE3-NEXT: pandn %xmm8, %xmm12
-; SSSE3-NEXT: por %xmm11, %xmm12
-; SSSE3-NEXT: movdqa %xmm9, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm9
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm9, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm11
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm11, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm12, %xmm0
-; SSSE3-NEXT: pxor %xmm15, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm12
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm12, %xmm0
-; SSSE3-NEXT: packssdw %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm14, %xmm1
-; SSSE3-NEXT: pxor %xmm15, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm14
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm14, %xmm1
-; SSSE3-NEXT: movdqa %xmm7, %xmm2
-; SSSE3-NEXT: pxor %xmm15, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm11, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: packssdw %xmm1, %xmm2
-; SSSE3-NEXT: packssdw %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm15, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: pxor %xmm15, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm4, %xmm2
-; SSSE3-NEXT: packssdw %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm15, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm13, %xmm15
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm15
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,2,2]
+; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: packssdw %xmm4, %xmm1
+; SSE2-NEXT: packssdw %xmm1, %xmm5
+; SSE2-NEXT: packsswb %xmm5, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc_ssat_v16i64_v16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa (%rdi), %xmm8
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm12
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm11
+; SSSE3-NEXT: movdqa 80(%rdi), %xmm7
+; SSSE3-NEXT: movdqa 64(%rdi), %xmm5
+; SSSE3-NEXT: movdqa 112(%rdi), %xmm4
+; SSSE3-NEXT: movdqa 96(%rdi), %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [127,127]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm10, %xmm10
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
+; SSSE3-NEXT: movdqa %xmm9, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm6, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13
+; SSSE3-NEXT: movdqa %xmm9, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,3,3]
+; SSSE3-NEXT: pandn %xmm6, %xmm3
; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm13
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm13, %xmm3
-; SSSE3-NEXT: packssdw %xmm1, %xmm3
-; SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSSE3-NEXT: packsswb %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13
+; SSSE3-NEXT: movdqa %xmm9, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: pandn %xmm6, %xmm4
+; SSSE3-NEXT: por %xmm5, %xmm4
+; SSSE3-NEXT: movdqa %xmm7, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13
+; SSSE3-NEXT: movdqa %xmm9, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm7
+; SSSE3-NEXT: pandn %xmm6, %xmm5
+; SSSE3-NEXT: por %xmm7, %xmm5
+; SSSE3-NEXT: movdqa %xmm12, %xmm7
+; SSSE3-NEXT: pxor %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13
+; SSSE3-NEXT: movdqa %xmm9, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm12
+; SSSE3-NEXT: pandn %xmm6, %xmm7
+; SSSE3-NEXT: por %xmm12, %xmm7
+; SSSE3-NEXT: movdqa %xmm11, %xmm12
+; SSSE3-NEXT: pxor %xmm1, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13
+; SSSE3-NEXT: movdqa %xmm9, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm12, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm12
+; SSSE3-NEXT: pand %xmm12, %xmm11
+; SSSE3-NEXT: pandn %xmm6, %xmm12
+; SSSE3-NEXT: por %xmm11, %xmm12
+; SSSE3-NEXT: movdqa %xmm8, %xmm11
+; SSSE3-NEXT: pxor %xmm1, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13
+; SSSE3-NEXT: movdqa %xmm9, %xmm14
+; SSSE3-NEXT: pcmpgtd %xmm11, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm15
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3]
+; SSSE3-NEXT: por %xmm15, %xmm11
+; SSSE3-NEXT: pand %xmm11, %xmm8
+; SSSE3-NEXT: pandn %xmm6, %xmm11
+; SSSE3-NEXT: por %xmm8, %xmm11
+; SSSE3-NEXT: movdqa %xmm0, %xmm8
+; SSSE3-NEXT: pxor %xmm1, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm8, %xmm10
+; SSSE3-NEXT: pand %xmm10, %xmm0
+; SSSE3-NEXT: pandn %xmm6, %xmm10
+; SSSE3-NEXT: por %xmm0, %xmm10
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT: movdqa %xmm10, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm13
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm13, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3]
+; SSSE3-NEXT: por %xmm14, %xmm13
+; SSSE3-NEXT: pand %xmm13, %xmm10
+; SSSE3-NEXT: pandn %xmm6, %xmm13
+; SSSE3-NEXT: por %xmm10, %xmm13
+; SSSE3-NEXT: movdqa %xmm11, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pand %xmm10, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: por %xmm14, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm11
+; SSSE3-NEXT: pandn %xmm6, %xmm0
+; SSSE3-NEXT: por %xmm11, %xmm0
+; SSSE3-NEXT: packssdw %xmm13, %xmm0
+; SSSE3-NEXT: movdqa %xmm12, %xmm10
+; SSSE3-NEXT: pxor %xmm1, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm11, %xmm13
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm13, %xmm10
+; SSSE3-NEXT: pand %xmm10, %xmm12
+; SSSE3-NEXT: pandn %xmm6, %xmm10
+; SSSE3-NEXT: por %xmm12, %xmm10
+; SSSE3-NEXT: movdqa %xmm7, %xmm11
+; SSSE3-NEXT: pxor %xmm1, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
+; SSSE3-NEXT: pand %xmm12, %xmm13
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSSE3-NEXT: por %xmm13, %xmm11
+; SSSE3-NEXT: pand %xmm11, %xmm7
+; SSSE3-NEXT: pandn %xmm6, %xmm11
+; SSSE3-NEXT: por %xmm7, %xmm11
+; SSSE3-NEXT: packssdw %xmm10, %xmm11
+; SSSE3-NEXT: packssdw %xmm11, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm7
+; SSSE3-NEXT: pxor %xmm1, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pand %xmm10, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm5
+; SSSE3-NEXT: pandn %xmm6, %xmm7
+; SSSE3-NEXT: por %xmm5, %xmm7
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pand %xmm10, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: pandn %xmm6, %xmm5
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: packssdw %xmm7, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pand %xmm7, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: pandn %xmm6, %xmm4
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pand %xmm3, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: pandn %xmm6, %xmm1
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: packssdw %xmm4, %xmm1
+; SSSE3-NEXT: packssdw %xmm1, %xmm5
+; SSSE3-NEXT: packsswb %xmm5, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v16i64_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm11
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
-; SSE41-NEXT: movdqa 32(%rdi), %xmm15
-; SSE41-NEXT: movdqa 48(%rdi), %xmm12
-; SSE41-NEXT: movdqa 80(%rdi), %xmm4
-; SSE41-NEXT: movdqa 64(%rdi), %xmm14
-; SSE41-NEXT: movdqa 112(%rdi), %xmm13
-; SSE41-NEXT: movdqa 96(%rdi), %xmm3
+; SSE41-NEXT: movdqa (%rdi), %xmm8
+; SSE41-NEXT: movdqa 16(%rdi), %xmm7
+; SSE41-NEXT: movdqa 32(%rdi), %xmm12
+; SSE41-NEXT: movdqa 48(%rdi), %xmm11
+; SSE41-NEXT: movdqa 80(%rdi), %xmm10
+; SSE41-NEXT: movdqa 64(%rdi), %xmm6
+; SSE41-NEXT: movdqa 112(%rdi), %xmm5
+; SSE41-NEXT: movdqa 96(%rdi), %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm7, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8
-; SSE41-NEXT: movdqa %xmm13, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm10
-; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm10
-; SSE41-NEXT: movdqa %xmm14, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm13
-; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm13
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm9, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm14
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14
-; SSE41-NEXT: movdqa %xmm15, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
+; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6
; SSE41-NEXT: movdqa %xmm12, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm15
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm15
+; SSE41-NEXT: movdqa %xmm9, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm10
+; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10
; SSE41-NEXT: movdqa %xmm11, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm6
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm12
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm12
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
+; SSE41-NEXT: movdqa %xmm9, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
+; SSE41-NEXT: movdqa %xmm9, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm11
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm1, %xmm5
-; SSE41-NEXT: xorpd %xmm2, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: movapd %xmm5, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm6, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm9
+; SSE41-NEXT: xorpd %xmm2, %xmm9
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840]
+; SSE41-NEXT: movapd %xmm9, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm13, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
+; SSE41-NEXT: movapd %xmm11, %xmm1
; SSE41-NEXT: xorpd %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm13, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE41-NEXT: packssdw %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm15, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1
+; SSE41-NEXT: packssdw %xmm9, %xmm1
+; SSE41-NEXT: movapd %xmm12, %xmm9
+; SSE41-NEXT: xorpd %xmm2, %xmm9
+; SSE41-NEXT: movapd %xmm9, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm9
+; SSE41-NEXT: movapd %xmm10, %xmm11
+; SSE41-NEXT: xorpd %xmm2, %xmm11
+; SSE41-NEXT: movapd %xmm11, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: movapd %xmm7, %xmm11
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11
+; SSE41-NEXT: packssdw %xmm9, %xmm11
+; SSE41-NEXT: packssdw %xmm11, %xmm1
+; SSE41-NEXT: movapd %xmm6, %xmm9
+; SSE41-NEXT: xorpd %xmm2, %xmm9
+; SSE41-NEXT: movapd %xmm9, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9
+; SSE41-NEXT: movapd %xmm5, %xmm6
+; SSE41-NEXT: xorpd %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movapd %xmm7, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
+; SSE41-NEXT: packssdw %xmm9, %xmm6
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: xorpd %xmm2, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE41-NEXT: movapd %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
-; SSE41-NEXT: packssdw %xmm3, %xmm5
-; SSE41-NEXT: packssdw %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm14, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3
-; SSE41-NEXT: movapd %xmm13, %xmm4
-; SSE41-NEXT: xorpd %xmm2, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm4
-; SSE41-NEXT: packssdw %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm10, %xmm3
-; SSE41-NEXT: xorpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
-; SSE41-NEXT: xorpd %xmm8, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
+; SSE41-NEXT: xorpd %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
-; SSE41-NEXT: packssdw %xmm3, %xmm7
-; SSE41-NEXT: packssdw %xmm7, %xmm4
-; SSE41-NEXT: packsswb %xmm4, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT: packssdw %xmm5, %xmm7
+; SSE41-NEXT: packssdw %xmm7, %xmm6
+; SSE41-NEXT: packsswb %xmm6, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -4728,10 +4728,10 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm8
+; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3
@@ -4741,39 +4741,39 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; AVX1-NEXT: vmovdqa (%rdi), %xmm5
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm7, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpackssdw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm6, %xmm1
-; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm6, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm6, %xmm2
-; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm9, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm6, %xmm3
-; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5
+; AVX1-NEXT: vblendvpd %xmm5, %xmm8, %xmm6, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8
+; AVX1-NEXT: vblendvpd %xmm8, %xmm7, %xmm6, %xmm7
+; AVX1-NEXT: vpackssdw %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpackssdw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5
+; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5
+; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_ssat_v16i64_v16i8:
@@ -5624,82 +5624,82 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind {
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movd %xmm3, %edx
-; SSE2-NEXT: movw %dx, 36(%rdi)
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: movw %cx, 24(%rdi)
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: movw %ax, 12(%rdi)
-; SSE2-NEXT: movd %xmm0, %r8d
-; SSE2-NEXT: movw %r8w, (%rdi)
+; SSE2-NEXT: movd %xmm3, %r8d
+; SSE2-NEXT: movw %r8w, 36(%rdi)
+; SSE2-NEXT: movd %xmm2, %r11d
+; SSE2-NEXT: movw %r11w, 24(%rdi)
+; SSE2-NEXT: movd %xmm1, %r14d
+; SSE2-NEXT: movw %r14w, 12(%rdi)
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3]
-; SSE2-NEXT: movd %xmm4, %r9d
-; SSE2-NEXT: movw %r9w, 45(%rdi)
+; SSE2-NEXT: movd %xmm4, %ecx
+; SSE2-NEXT: movw %cx, 45(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; SSE2-NEXT: movd %xmm4, %r10d
-; SSE2-NEXT: movw %r10w, 42(%rdi)
+; SSE2-NEXT: movd %xmm4, %edx
+; SSE2-NEXT: movw %dx, 42(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; SSE2-NEXT: movd %xmm3, %r11d
-; SSE2-NEXT: movw %r11w, 39(%rdi)
-; SSE2-NEXT: shrl $16, %edx
-; SSE2-NEXT: movb %dl, 38(%rdi)
+; SSE2-NEXT: movd %xmm3, %esi
+; SSE2-NEXT: movw %si, 39(%rdi)
+; SSE2-NEXT: shrl $16, %r8d
+; SSE2-NEXT: movb %r8b, 38(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
-; SSE2-NEXT: movd %xmm3, %r14d
-; SSE2-NEXT: movw %r14w, 33(%rdi)
+; SSE2-NEXT: movd %xmm3, %r8d
+; SSE2-NEXT: movw %r8w, 33(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE2-NEXT: movd %xmm3, %r15d
-; SSE2-NEXT: movw %r15w, 30(%rdi)
+; SSE2-NEXT: movd %xmm3, %r9d
+; SSE2-NEXT: movw %r9w, 30(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE2-NEXT: movd %xmm2, %r12d
-; SSE2-NEXT: movw %r12w, 27(%rdi)
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 26(%rdi)
+; SSE2-NEXT: movd %xmm2, %r10d
+; SSE2-NEXT: movw %r10w, 27(%rdi)
+; SSE2-NEXT: shrl $16, %r11d
+; SSE2-NEXT: movb %r11b, 26(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; SSE2-NEXT: movd %xmm2, %esi
-; SSE2-NEXT: movw %si, 21(%rdi)
+; SSE2-NEXT: movd %xmm2, %r11d
+; SSE2-NEXT: movw %r11w, 21(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm2, %ebx
; SSE2-NEXT: movw %bx, 18(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT: movd %xmm1, %ebp
; SSE2-NEXT: movw %bp, 15(%rdi)
-; SSE2-NEXT: shrl $16, %eax
-; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: shrl $16, %r14d
+; SSE2-NEXT: movb %r14b, 14(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: movw %ax, 9(%rdi)
+; SSE2-NEXT: movd %xmm1, %r14d
+; SSE2-NEXT: movw %r14w, 9(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: movd %xmm1, %ecx
-; SSE2-NEXT: movw %cx, 6(%rdi)
+; SSE2-NEXT: movd %xmm1, %r15d
+; SSE2-NEXT: movw %r15w, 6(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT: movd %xmm0, %edx
-; SSE2-NEXT: movw %dx, 3(%rdi)
+; SSE2-NEXT: movd %xmm0, %r12d
+; SSE2-NEXT: movw %r12w, 3(%rdi)
+; SSE2-NEXT: shrl $16, %eax
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: shrl $16, %ecx
+; SSE2-NEXT: movb %cl, 47(%rdi)
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 44(%rdi)
+; SSE2-NEXT: shrl $16, %esi
+; SSE2-NEXT: movb %sil, 41(%rdi)
; SSE2-NEXT: shrl $16, %r8d
-; SSE2-NEXT: movb %r8b, 2(%rdi)
+; SSE2-NEXT: movb %r8b, 35(%rdi)
; SSE2-NEXT: shrl $16, %r9d
-; SSE2-NEXT: movb %r9b, 47(%rdi)
+; SSE2-NEXT: movb %r9b, 32(%rdi)
; SSE2-NEXT: shrl $16, %r10d
-; SSE2-NEXT: movb %r10b, 44(%rdi)
+; SSE2-NEXT: movb %r10b, 29(%rdi)
; SSE2-NEXT: shrl $16, %r11d
-; SSE2-NEXT: movb %r11b, 41(%rdi)
-; SSE2-NEXT: shrl $16, %r14d
-; SSE2-NEXT: movb %r14b, 35(%rdi)
-; SSE2-NEXT: shrl $16, %r15d
-; SSE2-NEXT: movb %r15b, 32(%rdi)
-; SSE2-NEXT: shrl $16, %r12d
-; SSE2-NEXT: movb %r12b, 29(%rdi)
-; SSE2-NEXT: shrl $16, %esi
-; SSE2-NEXT: movb %sil, 23(%rdi)
+; SSE2-NEXT: movb %r11b, 23(%rdi)
; SSE2-NEXT: shrl $16, %ebx
; SSE2-NEXT: movb %bl, 20(%rdi)
; SSE2-NEXT: shrl $16, %ebp
; SSE2-NEXT: movb %bpl, 17(%rdi)
-; SSE2-NEXT: shrl $16, %eax
-; SSE2-NEXT: movb %al, 11(%rdi)
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 8(%rdi)
-; SSE2-NEXT: shrl $16, %edx
-; SSE2-NEXT: movb %dl, 5(%rdi)
+; SSE2-NEXT: shrl $16, %r14d
+; SSE2-NEXT: movb %r14b, 11(%rdi)
+; SSE2-NEXT: shrl $16, %r15d
+; SSE2-NEXT: movb %r15b, 8(%rdi)
+; SSE2-NEXT: shrl $16, %r12d
+; SSE2-NEXT: movb %r12b, 5(%rdi)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r14
@@ -5756,82 +5756,82 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind {
; SSSE3-NEXT: pand %xmm3, %xmm4
; SSSE3-NEXT: pandn %xmm5, %xmm3
; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: movd %xmm3, %edx
-; SSSE3-NEXT: movw %dx, 36(%rdi)
-; SSSE3-NEXT: movd %xmm2, %ecx
-; SSSE3-NEXT: movw %cx, 24(%rdi)
-; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: movw %ax, 12(%rdi)
-; SSSE3-NEXT: movd %xmm0, %r8d
-; SSSE3-NEXT: movw %r8w, (%rdi)
+; SSSE3-NEXT: movd %xmm3, %r8d
+; SSSE3-NEXT: movw %r8w, 36(%rdi)
+; SSSE3-NEXT: movd %xmm2, %r11d
+; SSSE3-NEXT: movw %r11w, 24(%rdi)
+; SSSE3-NEXT: movd %xmm1, %r14d
+; SSSE3-NEXT: movw %r14w, 12(%rdi)
+; SSSE3-NEXT: movd %xmm0, %eax
+; SSSE3-NEXT: movw %ax, (%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3]
-; SSSE3-NEXT: movd %xmm4, %r9d
-; SSSE3-NEXT: movw %r9w, 45(%rdi)
+; SSSE3-NEXT: movd %xmm4, %ecx
+; SSSE3-NEXT: movw %cx, 45(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; SSSE3-NEXT: movd %xmm4, %r10d
-; SSSE3-NEXT: movw %r10w, 42(%rdi)
+; SSSE3-NEXT: movd %xmm4, %edx
+; SSSE3-NEXT: movw %dx, 42(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; SSSE3-NEXT: movd %xmm3, %r11d
-; SSSE3-NEXT: movw %r11w, 39(%rdi)
-; SSSE3-NEXT: shrl $16, %edx
-; SSSE3-NEXT: movb %dl, 38(%rdi)
+; SSSE3-NEXT: movd %xmm3, %esi
+; SSSE3-NEXT: movw %si, 39(%rdi)
+; SSSE3-NEXT: shrl $16, %r8d
+; SSSE3-NEXT: movb %r8b, 38(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
-; SSSE3-NEXT: movd %xmm3, %r14d
-; SSSE3-NEXT: movw %r14w, 33(%rdi)
+; SSSE3-NEXT: movd %xmm3, %r8d
+; SSSE3-NEXT: movw %r8w, 33(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSSE3-NEXT: movd %xmm3, %r15d
-; SSSE3-NEXT: movw %r15w, 30(%rdi)
+; SSSE3-NEXT: movd %xmm3, %r9d
+; SSSE3-NEXT: movw %r9w, 30(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSSE3-NEXT: movd %xmm2, %r12d
-; SSSE3-NEXT: movw %r12w, 27(%rdi)
-; SSSE3-NEXT: shrl $16, %ecx
-; SSSE3-NEXT: movb %cl, 26(%rdi)
+; SSSE3-NEXT: movd %xmm2, %r10d
+; SSSE3-NEXT: movw %r10w, 27(%rdi)
+; SSSE3-NEXT: shrl $16, %r11d
+; SSSE3-NEXT: movb %r11b, 26(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; SSSE3-NEXT: movd %xmm2, %esi
-; SSSE3-NEXT: movw %si, 21(%rdi)
+; SSSE3-NEXT: movd %xmm2, %r11d
+; SSSE3-NEXT: movw %r11w, 21(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSSE3-NEXT: movd %xmm2, %ebx
; SSSE3-NEXT: movw %bx, 18(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSSE3-NEXT: movd %xmm1, %ebp
; SSSE3-NEXT: movw %bp, 15(%rdi)
-; SSSE3-NEXT: shrl $16, %eax
-; SSSE3-NEXT: movb %al, 14(%rdi)
+; SSSE3-NEXT: shrl $16, %r14d
+; SSSE3-NEXT: movb %r14b, 14(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: movw %ax, 9(%rdi)
+; SSSE3-NEXT: movd %xmm1, %r14d
+; SSSE3-NEXT: movw %r14w, 9(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSSE3-NEXT: movd %xmm1, %ecx
-; SSSE3-NEXT: movw %cx, 6(%rdi)
+; SSSE3-NEXT: movd %xmm1, %r15d
+; SSSE3-NEXT: movw %r15w, 6(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSSE3-NEXT: movd %xmm0, %edx
-; SSSE3-NEXT: movw %dx, 3(%rdi)
+; SSSE3-NEXT: movd %xmm0, %r12d
+; SSSE3-NEXT: movw %r12w, 3(%rdi)
+; SSSE3-NEXT: shrl $16, %eax
+; SSSE3-NEXT: movb %al, 2(%rdi)
+; SSSE3-NEXT: shrl $16, %ecx
+; SSSE3-NEXT: movb %cl, 47(%rdi)
+; SSSE3-NEXT: shrl $16, %edx
+; SSSE3-NEXT: movb %dl, 44(%rdi)
+; SSSE3-NEXT: shrl $16, %esi
+; SSSE3-NEXT: movb %sil, 41(%rdi)
; SSSE3-NEXT: shrl $16, %r8d
-; SSSE3-NEXT: movb %r8b, 2(%rdi)
+; SSSE3-NEXT: movb %r8b, 35(%rdi)
; SSSE3-NEXT: shrl $16, %r9d
-; SSSE3-NEXT: movb %r9b, 47(%rdi)
+; SSSE3-NEXT: movb %r9b, 32(%rdi)
; SSSE3-NEXT: shrl $16, %r10d
-; SSSE3-NEXT: movb %r10b, 44(%rdi)
+; SSSE3-NEXT: movb %r10b, 29(%rdi)
; SSSE3-NEXT: shrl $16, %r11d
-; SSSE3-NEXT: movb %r11b, 41(%rdi)
-; SSSE3-NEXT: shrl $16, %r14d
-; SSSE3-NEXT: movb %r14b, 35(%rdi)
-; SSSE3-NEXT: shrl $16, %r15d
-; SSSE3-NEXT: movb %r15b, 32(%rdi)
-; SSSE3-NEXT: shrl $16, %r12d
-; SSSE3-NEXT: movb %r12b, 29(%rdi)
-; SSSE3-NEXT: shrl $16, %esi
-; SSSE3-NEXT: movb %sil, 23(%rdi)
+; SSSE3-NEXT: movb %r11b, 23(%rdi)
; SSSE3-NEXT: shrl $16, %ebx
; SSSE3-NEXT: movb %bl, 20(%rdi)
; SSSE3-NEXT: shrl $16, %ebp
; SSSE3-NEXT: movb %bpl, 17(%rdi)
-; SSSE3-NEXT: shrl $16, %eax
-; SSSE3-NEXT: movb %al, 11(%rdi)
-; SSSE3-NEXT: shrl $16, %ecx
-; SSSE3-NEXT: movb %cl, 8(%rdi)
-; SSSE3-NEXT: shrl $16, %edx
-; SSSE3-NEXT: movb %dl, 5(%rdi)
+; SSSE3-NEXT: shrl $16, %r14d
+; SSSE3-NEXT: movb %r14b, 11(%rdi)
+; SSSE3-NEXT: shrl $16, %r15d
+; SSSE3-NEXT: movb %r15b, 8(%rdi)
+; SSSE3-NEXT: shrl $16, %r12d
+; SSSE3-NEXT: movb %r12b, 5(%rdi)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r14
@@ -6084,72 +6084,72 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind {
; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX512-NEXT: movw %cx, 45(%rdi)
-; AVX512-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512-NEXT: movw %ax, 42(%rdi)
+; AVX512-NEXT: vpextrd $3, %xmm1, %r15d
+; AVX512-NEXT: movw %r15w, 45(%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm1, %r14d
+; AVX512-NEXT: movw %r14w, 42(%rdi)
; AVX512-NEXT: vpextrd $1, %xmm1, %ebp
; AVX512-NEXT: movw %bp, 39(%rdi)
-; AVX512-NEXT: vmovd %xmm1, %esi
-; AVX512-NEXT: movw %si, 36(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %r11d
+; AVX512-NEXT: movw %r11w, 36(%rdi)
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512-NEXT: vpextrd $3, %xmm1, %ebx
; AVX512-NEXT: movw %bx, 33(%rdi)
-; AVX512-NEXT: vpextrd $2, %xmm1, %edx
-; AVX512-NEXT: movw %dx, 30(%rdi)
-; AVX512-NEXT: vpextrd $1, %xmm1, %r15d
-; AVX512-NEXT: movw %r15w, 27(%rdi)
-; AVX512-NEXT: vmovd %xmm1, %r14d
-; AVX512-NEXT: movw %r14w, 24(%rdi)
-; AVX512-NEXT: vpextrd $3, %xmm0, %r11d
-; AVX512-NEXT: movw %r11w, 9(%rdi)
-; AVX512-NEXT: vpextrd $2, %xmm0, %r10d
-; AVX512-NEXT: movw %r10w, 6(%rdi)
-; AVX512-NEXT: vpextrd $1, %xmm0, %r9d
-; AVX512-NEXT: movw %r9w, 3(%rdi)
-; AVX512-NEXT: vmovd %xmm0, %r8d
-; AVX512-NEXT: movw %r8w, (%rdi)
-; AVX512-NEXT: shrl $16, %ecx
-; AVX512-NEXT: movb %cl, 47(%rdi)
-; AVX512-NEXT: shrl $16, %eax
-; AVX512-NEXT: movb %al, 44(%rdi)
-; AVX512-NEXT: shrl $16, %ebp
-; AVX512-NEXT: movb %bpl, 41(%rdi)
-; AVX512-NEXT: shrl $16, %esi
-; AVX512-NEXT: movb %sil, 38(%rdi)
-; AVX512-NEXT: shrl $16, %ebx
-; AVX512-NEXT: movb %bl, 35(%rdi)
-; AVX512-NEXT: shrl $16, %edx
-; AVX512-NEXT: movb %dl, 32(%rdi)
-; AVX512-NEXT: shrl $16, %r15d
-; AVX512-NEXT: movb %r15b, 29(%rdi)
-; AVX512-NEXT: shrl $16, %r14d
-; AVX512-NEXT: movb %r14b, 26(%rdi)
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpextrd $2, %xmm1, %r10d
+; AVX512-NEXT: movw %r10w, 30(%rdi)
+; AVX512-NEXT: vpextrd $1, %xmm1, %r9d
+; AVX512-NEXT: movw %r9w, 27(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %r8d
+; AVX512-NEXT: movw %r8w, 24(%rdi)
; AVX512-NEXT: vpextrd $3, %xmm0, %esi
-; AVX512-NEXT: movw %si, 21(%rdi)
+; AVX512-NEXT: movw %si, 9(%rdi)
; AVX512-NEXT: vpextrd $2, %xmm0, %edx
-; AVX512-NEXT: movw %dx, 18(%rdi)
+; AVX512-NEXT: movw %dx, 6(%rdi)
; AVX512-NEXT: vpextrd $1, %xmm0, %ecx
-; AVX512-NEXT: movw %cx, 15(%rdi)
+; AVX512-NEXT: movw %cx, 3(%rdi)
; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: shrl $16, %r15d
+; AVX512-NEXT: movb %r15b, 47(%rdi)
+; AVX512-NEXT: shrl $16, %r14d
+; AVX512-NEXT: movb %r14b, 44(%rdi)
+; AVX512-NEXT: shrl $16, %ebp
+; AVX512-NEXT: movb %bpl, 41(%rdi)
; AVX512-NEXT: shrl $16, %r11d
-; AVX512-NEXT: movb %r11b, 11(%rdi)
+; AVX512-NEXT: movb %r11b, 38(%rdi)
+; AVX512-NEXT: shrl $16, %ebx
+; AVX512-NEXT: movb %bl, 35(%rdi)
; AVX512-NEXT: shrl $16, %r10d
-; AVX512-NEXT: movb %r10b, 8(%rdi)
+; AVX512-NEXT: movb %r10b, 32(%rdi)
; AVX512-NEXT: shrl $16, %r9d
-; AVX512-NEXT: movb %r9b, 5(%rdi)
+; AVX512-NEXT: movb %r9b, 29(%rdi)
; AVX512-NEXT: shrl $16, %r8d
-; AVX512-NEXT: movb %r8b, 2(%rdi)
+; AVX512-NEXT: movb %r8b, 26(%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpextrd $3, %xmm0, %r11d
+; AVX512-NEXT: movw %r11w, 21(%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm0, %r10d
+; AVX512-NEXT: movw %r10w, 18(%rdi)
+; AVX512-NEXT: vpextrd $1, %xmm0, %r9d
+; AVX512-NEXT: movw %r9w, 15(%rdi)
+; AVX512-NEXT: vmovd %xmm0, %r8d
+; AVX512-NEXT: movw %r8w, 12(%rdi)
; AVX512-NEXT: shrl $16, %esi
-; AVX512-NEXT: movb %sil, 23(%rdi)
+; AVX512-NEXT: movb %sil, 11(%rdi)
; AVX512-NEXT: shrl $16, %edx
-; AVX512-NEXT: movb %dl, 20(%rdi)
+; AVX512-NEXT: movb %dl, 8(%rdi)
; AVX512-NEXT: shrl $16, %ecx
-; AVX512-NEXT: movb %cl, 17(%rdi)
+; AVX512-NEXT: movb %cl, 5(%rdi)
; AVX512-NEXT: shrl $16, %eax
-; AVX512-NEXT: movb %al, 14(%rdi)
+; AVX512-NEXT: movb %al, 2(%rdi)
+; AVX512-NEXT: shrl $16, %r11d
+; AVX512-NEXT: movb %r11b, 23(%rdi)
+; AVX512-NEXT: shrl $16, %r10d
+; AVX512-NEXT: movb %r10b, 20(%rdi)
+; AVX512-NEXT: shrl $16, %r9d
+; AVX512-NEXT: movb %r9b, 17(%rdi)
+; AVX512-NEXT: shrl $16, %r8d
+; AVX512-NEXT: movb %r8b, 14(%rdi)
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
@@ -6166,72 +6166,72 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind {
; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; SKX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; SKX-NEXT: vpextrd $3, %xmm1, %ecx
-; SKX-NEXT: movw %cx, 45(%rdi)
-; SKX-NEXT: vpextrd $2, %xmm1, %eax
-; SKX-NEXT: movw %ax, 42(%rdi)
+; SKX-NEXT: vpextrd $3, %xmm1, %r15d
+; SKX-NEXT: movw %r15w, 45(%rdi)
+; SKX-NEXT: vpextrd $2, %xmm1, %r14d
+; SKX-NEXT: movw %r14w, 42(%rdi)
; SKX-NEXT: vpextrd $1, %xmm1, %ebp
; SKX-NEXT: movw %bp, 39(%rdi)
-; SKX-NEXT: vmovd %xmm1, %esi
-; SKX-NEXT: movw %si, 36(%rdi)
+; SKX-NEXT: vmovd %xmm1, %r11d
+; SKX-NEXT: movw %r11w, 36(%rdi)
; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; SKX-NEXT: vpextrd $3, %xmm1, %ebx
; SKX-NEXT: movw %bx, 33(%rdi)
-; SKX-NEXT: vpextrd $2, %xmm1, %edx
-; SKX-NEXT: movw %dx, 30(%rdi)
-; SKX-NEXT: vpextrd $1, %xmm1, %r15d
-; SKX-NEXT: movw %r15w, 27(%rdi)
-; SKX-NEXT: vmovd %xmm1, %r14d
-; SKX-NEXT: vpextrd $3, %xmm0, %r11d
-; SKX-NEXT: movw %r14w, 24(%rdi)
-; SKX-NEXT: movw %r11w, 9(%rdi)
-; SKX-NEXT: vpextrd $2, %xmm0, %r10d
-; SKX-NEXT: vpextrd $1, %xmm0, %r9d
-; SKX-NEXT: movw %r10w, 6(%rdi)
-; SKX-NEXT: movw %r9w, 3(%rdi)
-; SKX-NEXT: vmovd %xmm0, %r8d
-; SKX-NEXT: movw %r8w, (%rdi)
-; SKX-NEXT: shrl $16, %ecx
-; SKX-NEXT: movb %cl, 47(%rdi)
-; SKX-NEXT: shrl $16, %eax
-; SKX-NEXT: movb %al, 44(%rdi)
-; SKX-NEXT: shrl $16, %ebp
-; SKX-NEXT: movb %bpl, 41(%rdi)
-; SKX-NEXT: shrl $16, %esi
-; SKX-NEXT: movb %sil, 38(%rdi)
-; SKX-NEXT: shrl $16, %ebx
-; SKX-NEXT: movb %bl, 35(%rdi)
-; SKX-NEXT: shrl $16, %edx
-; SKX-NEXT: movb %dl, 32(%rdi)
-; SKX-NEXT: shrl $16, %r15d
-; SKX-NEXT: movb %r15b, 29(%rdi)
-; SKX-NEXT: shrl $16, %r14d
-; SKX-NEXT: movb %r14b, 26(%rdi)
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vpextrd $2, %xmm1, %r10d
+; SKX-NEXT: movw %r10w, 30(%rdi)
+; SKX-NEXT: vpextrd $1, %xmm1, %r9d
+; SKX-NEXT: movw %r9w, 27(%rdi)
+; SKX-NEXT: vmovd %xmm1, %r8d
; SKX-NEXT: vpextrd $3, %xmm0, %esi
-; SKX-NEXT: movw %si, 21(%rdi)
+; SKX-NEXT: movw %r8w, 24(%rdi)
+; SKX-NEXT: movw %si, 9(%rdi)
; SKX-NEXT: vpextrd $2, %xmm0, %edx
-; SKX-NEXT: movw %dx, 18(%rdi)
; SKX-NEXT: vpextrd $1, %xmm0, %ecx
-; SKX-NEXT: movw %cx, 15(%rdi)
+; SKX-NEXT: movw %dx, 6(%rdi)
+; SKX-NEXT: movw %cx, 3(%rdi)
; SKX-NEXT: vmovd %xmm0, %eax
-; SKX-NEXT: movw %ax, 12(%rdi)
+; SKX-NEXT: movw %ax, (%rdi)
+; SKX-NEXT: shrl $16, %r15d
+; SKX-NEXT: movb %r15b, 47(%rdi)
+; SKX-NEXT: shrl $16, %r14d
+; SKX-NEXT: movb %r14b, 44(%rdi)
+; SKX-NEXT: shrl $16, %ebp
+; SKX-NEXT: movb %bpl, 41(%rdi)
; SKX-NEXT: shrl $16, %r11d
-; SKX-NEXT: movb %r11b, 11(%rdi)
+; SKX-NEXT: movb %r11b, 38(%rdi)
+; SKX-NEXT: shrl $16, %ebx
+; SKX-NEXT: movb %bl, 35(%rdi)
; SKX-NEXT: shrl $16, %r10d
-; SKX-NEXT: movb %r10b, 8(%rdi)
+; SKX-NEXT: movb %r10b, 32(%rdi)
; SKX-NEXT: shrl $16, %r9d
-; SKX-NEXT: movb %r9b, 5(%rdi)
+; SKX-NEXT: movb %r9b, 29(%rdi)
; SKX-NEXT: shrl $16, %r8d
-; SKX-NEXT: movb %r8b, 2(%rdi)
+; SKX-NEXT: movb %r8b, 26(%rdi)
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vpextrd $3, %xmm0, %r11d
+; SKX-NEXT: movw %r11w, 21(%rdi)
+; SKX-NEXT: vpextrd $2, %xmm0, %r10d
+; SKX-NEXT: movw %r10w, 18(%rdi)
+; SKX-NEXT: vpextrd $1, %xmm0, %r9d
+; SKX-NEXT: movw %r9w, 15(%rdi)
+; SKX-NEXT: vmovd %xmm0, %r8d
+; SKX-NEXT: movw %r8w, 12(%rdi)
; SKX-NEXT: shrl $16, %esi
-; SKX-NEXT: movb %sil, 23(%rdi)
+; SKX-NEXT: movb %sil, 11(%rdi)
; SKX-NEXT: shrl $16, %edx
-; SKX-NEXT: movb %dl, 20(%rdi)
+; SKX-NEXT: movb %dl, 8(%rdi)
; SKX-NEXT: shrl $16, %ecx
-; SKX-NEXT: movb %cl, 17(%rdi)
+; SKX-NEXT: movb %cl, 5(%rdi)
; SKX-NEXT: shrl $16, %eax
-; SKX-NEXT: movb %al, 14(%rdi)
+; SKX-NEXT: movb %al, 2(%rdi)
+; SKX-NEXT: shrl $16, %r11d
+; SKX-NEXT: movb %r11b, 23(%rdi)
+; SKX-NEXT: shrl $16, %r10d
+; SKX-NEXT: movb %r10b, 20(%rdi)
+; SKX-NEXT: shrl $16, %r9d
+; SKX-NEXT: movb %r9b, 17(%rdi)
+; SKX-NEXT: shrl $16, %r8d
+; SKX-NEXT: movb %r8b, 14(%rdi)
; SKX-NEXT: popq %rbx
; SKX-NEXT: popq %r14
; SKX-NEXT: popq %r15
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 7710186c8850b..f5ccbdf843a91 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -433,184 +433,184 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) {
; SSE2-LABEL: trunc_usat_v8i64_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm9
+; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: movdqa 16(%rdi), %xmm5
; SSE2-NEXT: movdqa 32(%rdi), %xmm6
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm8
+; SSE2-NEXT: por %xmm1, %xmm8
; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm9, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i64_v8i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm9
+; SSSE3-NEXT: movdqa (%rdi), %xmm2
; SSSE3-NEXT: movdqa 16(%rdi), %xmm5
; SSSE3-NEXT: movdqa 32(%rdi), %xmm6
; SSSE3-NEXT: movdqa 48(%rdi), %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: pxor %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455]
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm8
+; SSSE3-NEXT: por %xmm1, %xmm8
; SSSE3-NEXT: movdqa %xmm6, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm7
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm7, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm1
; SSSE3-NEXT: por %xmm6, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
-; SSSE3-NEXT: movdqa %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm7
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm7, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm5
+; SSSE3-NEXT: pandn %xmm3, %xmm7
+; SSSE3-NEXT: por %xmm5, %xmm7
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pand %xmm6, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm9
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm9, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i64_v8i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm8
-; SSE41-NEXT: movdqa 16(%rdi), %xmm9
+; SSE41-NEXT: movdqa (%rdi), %xmm3
+; SSE41-NEXT: movdqa 16(%rdi), %xmm6
; SSE41-NEXT: movdqa 32(%rdi), %xmm7
; SSE41-NEXT: movdqa 48(%rdi), %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm4, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm4, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
-; SSE41-NEXT: pxor %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -1329,66 +1329,66 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
; SSE2-LABEL: trunc_usat_v8i64_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm6
; SSE2-NEXT: movdqa 48(%rdi), %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm7, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
; SSE2-NEXT: por %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pxor %xmm3, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm8
+; SSE2-NEXT: por %xmm4, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
@@ -1402,66 +1402,66 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
; SSSE3-LABEL: trunc_usat_v8i64_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm4
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm9
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
; SSSE3-NEXT: movdqa 32(%rdi), %xmm6
; SSSE3-NEXT: movdqa 48(%rdi), %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm6, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002324991,9223372039002324991]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm7, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm6
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm6
+; SSSE3-NEXT: pandn %xmm1, %xmm6
; SSSE3-NEXT: por %xmm7, %xmm6
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm9
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: movdqa %xmm4, %xmm7
+; SSSE3-NEXT: pxor %xmm3, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: pandn %xmm1, %xmm8
+; SSSE3-NEXT: por %xmm4, %xmm8
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pand %xmm7, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm4
+; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
@@ -1476,54 +1476,54 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm7
; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: movdqa 32(%rdi), %xmm8
-; SSE41-NEXT: movdqa 48(%rdi), %xmm9
+; SSE41-NEXT: movdqa 32(%rdi), %xmm3
+; SSE41-NEXT: movdqa 48(%rdi), %xmm6
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm4, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm4, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: packusdw %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: packusdw %xmm8, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
-; SSE41-NEXT: pxor %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
-; SSE41-NEXT: packusdw %xmm4, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm7, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1882,9 +1882,9 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) {
; SSE2-LABEL: trunc_usat_v16i32_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm8
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
; SSE2-NEXT: movdqa 32(%rdi), %xmm0
-; SSE2-NEXT: movdqa 48(%rdi), %xmm4
+; SSE2-NEXT: movdqa 48(%rdi), %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm6, %xmm3
@@ -1895,24 +1895,24 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) {
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm7, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: pxor %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm8
; SSE2-NEXT: pxor %xmm7, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm4
+; SSE2-NEXT: por %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm8
+; SSE2-NEXT: pxor %xmm6, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm5
; SSE2-NEXT: pxor %xmm7, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
; SSE2-NEXT: pxor %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
@@ -1929,9 +1929,9 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) {
; SSSE3-LABEL: trunc_usat_v16i32_v16i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm5
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm8
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm4
; SSSE3-NEXT: movdqa 32(%rdi), %xmm0
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm4
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm8
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm6, %xmm3
@@ -1942,24 +1942,24 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) {
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm7, %xmm1
; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
; SSSE3-NEXT: pxor %xmm6, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm4
+; SSSE3-NEXT: pand %xmm3, %xmm8
; SSSE3-NEXT: pxor %xmm7, %xmm3
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm6, %xmm4
+; SSSE3-NEXT: por %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm5, %xmm8
+; SSSE3-NEXT: pxor %xmm6, %xmm8
; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm5
; SSSE3-NEXT: pxor %xmm7, %xmm0
; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
; SSSE3-NEXT: pxor %xmm2, %xmm7
-; SSSE3-NEXT: pand %xmm8, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: por %xmm7, %xmm2
; SSSE3-NEXT: pslld $16, %xmm2
; SSSE3-NEXT: psrad $16, %xmm2
@@ -2422,7 +2422,7 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_usat_v4i64_v4i8_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm3
@@ -2431,28 +2431,28 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: por %xmm8, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: packuswb %xmm4, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm3
; SSE2-NEXT: movd %xmm3, (%rdi)
@@ -2460,7 +2460,7 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
;
; SSSE3-LABEL: trunc_usat_v4i64_v4i8_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm3
@@ -2469,29 +2469,29 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pand %xmm5, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: por %xmm8, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm2, %xmm3
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm4
+; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm0, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm4
; SSSE3-NEXT: pshufb %xmm0, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSSE3-NEXT: movd %xmm3, (%rdi)
; SSSE3-NEXT: retq
;
@@ -2608,65 +2608,65 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm6
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: movdqa 32(%rdi), %xmm9
+; SSE2-NEXT: movdqa 32(%rdi), %xmm1
; SSE2-NEXT: movdqa 48(%rdi), %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pxor %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm8
+; SSE2-NEXT: por %xmm0, %xmm8
; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm9
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm9, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: packuswb %xmm7, %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
@@ -2674,65 +2674,65 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm6
; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm9
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm1
; SSSE3-NEXT: movdqa 48(%rdi), %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm0, %xmm7
-; SSSE3-NEXT: pxor %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm2, %xmm8
+; SSSE3-NEXT: por %xmm0, %xmm8
; SSSE3-NEXT: movdqa %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm7, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm2, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm9
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm9, %xmm1
-; SSSE3-NEXT: packuswb %xmm3, %xmm1
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: packuswb %xmm8, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm7, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm5
+; SSSE3-NEXT: pandn %xmm2, %xmm7
+; SSSE3-NEXT: por %xmm5, %xmm7
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm4
+; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: packuswb %xmm7, %xmm4
+; SSSE3-NEXT: packuswb %xmm4, %xmm0
; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
@@ -2740,54 +2740,54 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm7
; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: movdqa 32(%rdi), %xmm8
-; SSE41-NEXT: movdqa 48(%rdi), %xmm9
+; SSE41-NEXT: movdqa 32(%rdi), %xmm3
+; SSE41-NEXT: movdqa 48(%rdi), %xmm6
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm4, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm4, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: packusdw %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: packusdw %xmm8, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
-; SSE41-NEXT: pxor %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
-; SSE41-NEXT: packusdw %xmm4, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm7, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -2866,65 +2866,65 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm6
; SSE2-NEXT: movdqa 16(%rdi), %xmm5
-; SSE2-NEXT: movdqa 32(%rdi), %xmm9
+; SSE2-NEXT: movdqa 32(%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm5, %xmm7
; SSE2-NEXT: pxor %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm1, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT: movdqa %xmm3, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm8
+; SSE2-NEXT: por %xmm5, %xmm8
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pxor %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm5
; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: packuswb %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm9, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm5
+; SSE2-NEXT: packuswb %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: packuswb %xmm7, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm5
; SSE2-NEXT: packuswb %xmm5, %xmm5
; SSE2-NEXT: movq %xmm5, (%rsi)
; SSE2-NEXT: retq
@@ -2933,65 +2933,65 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm6
; SSSE3-NEXT: movdqa 16(%rdi), %xmm5
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm9
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm0
; SSSE3-NEXT: movdqa 48(%rdi), %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm5, %xmm7
; SSSE3-NEXT: pxor %xmm2, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pand %xmm1, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm6, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm0, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT: movdqa %xmm3, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm1, %xmm8
+; SSSE3-NEXT: por %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSSE3-NEXT: pxor %xmm2, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
+; SSSE3-NEXT: movdqa %xmm3, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm7, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm1, %xmm5
; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: packuswb %xmm1, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm9
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm9, %xmm0
-; SSSE3-NEXT: packuswb %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm0, %xmm5
+; SSSE3-NEXT: packuswb %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
+; SSSE3-NEXT: movdqa %xmm3, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2]
+; SSSE3-NEXT: pand %xmm7, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pandn %xmm1, %xmm7
+; SSSE3-NEXT: por %xmm4, %xmm7
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm3
+; SSSE3-NEXT: por %xmm0, %xmm3
+; SSSE3-NEXT: packuswb %xmm7, %xmm3
+; SSSE3-NEXT: packuswb %xmm3, %xmm5
; SSSE3-NEXT: packuswb %xmm5, %xmm5
; SSSE3-NEXT: movq %xmm5, (%rsi)
; SSSE3-NEXT: retq
@@ -3000,54 +3000,54 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm7
; SSE41-NEXT: movdqa 16(%rdi), %xmm6
-; SSE41-NEXT: movdqa 32(%rdi), %xmm8
-; SSE41-NEXT: movdqa 48(%rdi), %xmm9
+; SSE41-NEXT: movdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movdqa 48(%rdi), %xmm5
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT: movdqa %xmm3, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm3, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm3, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6
-; SSE41-NEXT: packusdw %xmm3, %xmm6
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: packusdw %xmm8, %xmm6
+; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3
-; SSE41-NEXT: pxor %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm3, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm3, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
+; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm3, %xmm5
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE41-NEXT: packusdw %xmm3, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: packusdw %xmm7, %xmm1
; SSE41-NEXT: packusdw %xmm1, %xmm6
; SSE41-NEXT: packuswb %xmm6, %xmm6
; SSE41-NEXT: movq %xmm6, (%rsi)
@@ -3127,359 +3127,359 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
; SSE2-LABEL: trunc_usat_v16i64_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa 96(%rdi), %xmm9
-; SSE2-NEXT: movdqa 112(%rdi), %xmm10
-; SSE2-NEXT: movdqa 64(%rdi), %xmm11
-; SSE2-NEXT: movdqa 80(%rdi), %xmm12
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm6
-; SSE2-NEXT: movdqa 32(%rdi), %xmm13
-; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm14, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm14, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm6
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
+; SSE2-NEXT: movdqa 96(%rdi), %xmm1
+; SSE2-NEXT: movdqa 112(%rdi), %xmm3
+; SSE2-NEXT: movdqa 64(%rdi), %xmm6
+; SSE2-NEXT: movdqa 80(%rdi), %xmm7
+; SSE2-NEXT: movdqa (%rdi), %xmm10
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa 32(%rdi), %xmm8
+; SSE2-NEXT: movdqa 48(%rdi), %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm11
+; SSE2-NEXT: pxor %xmm4, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm12
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT: movdqa %xmm5, %xmm13
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm12
+; SSE2-NEXT: por %xmm0, %xmm12
+; SSE2-NEXT: movdqa %xmm10, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm11
+; SSE2-NEXT: movdqa %xmm5, %xmm13
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3]
+; SSE2-NEXT: por %xmm14, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm10, %xmm0
+; SSE2-NEXT: packuswb %xmm12, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pxor %xmm4, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm11
+; SSE2-NEXT: movdqa %xmm5, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm11, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm9
+; SSE2-NEXT: pandn %xmm2, %xmm11
+; SSE2-NEXT: por %xmm9, %xmm11
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pxor %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: pandn %xmm2, %xmm10
+; SSE2-NEXT: por %xmm8, %xmm10
+; SSE2-NEXT: packuswb %xmm11, %xmm10
+; SSE2-NEXT: packuswb %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm9, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm8, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm7
+; SSE2-NEXT: pandn %xmm2, %xmm9
+; SSE2-NEXT: por %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pxor %xmm4, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: packuswb %xmm9, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm2, %xmm8
+; SSE2-NEXT: por %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm14, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm13, %xmm1
-; SSE2-NEXT: pxor %xmm14, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm13
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm13, %xmm3
-; SSE2-NEXT: packuswb %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm12, %xmm1
-; SSE2-NEXT: pxor %xmm14, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm12
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm12, %xmm3
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: pxor %xmm14, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm11
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm11, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: pxor %xmm14, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm10, %xmm4
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: pxor %xmm14, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm14, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm9
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm9, %xmm2
-; SSE2-NEXT: packuswb %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm8, %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm7
+; SSE2-NEXT: packuswb %xmm7, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v16i64_v16i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa 96(%rdi), %xmm9
-; SSSE3-NEXT: movdqa 112(%rdi), %xmm10
-; SSSE3-NEXT: movdqa 64(%rdi), %xmm11
-; SSSE3-NEXT: movdqa 80(%rdi), %xmm12
-; SSSE3-NEXT: movdqa (%rdi), %xmm3
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm6
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm13
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm14 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm14, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm14, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm6
-; SSSE3-NEXT: movdqa %xmm2, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm5, %xmm0
+; SSSE3-NEXT: movdqa 96(%rdi), %xmm1
+; SSSE3-NEXT: movdqa 112(%rdi), %xmm3
+; SSSE3-NEXT: movdqa 64(%rdi), %xmm6
+; SSSE3-NEXT: movdqa 80(%rdi), %xmm7
+; SSSE3-NEXT: movdqa (%rdi), %xmm10
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm8
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa %xmm0, %xmm11
+; SSSE3-NEXT: pxor %xmm4, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm12
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
+; SSSE3-NEXT: movdqa %xmm5, %xmm13
+; SSSE3-NEXT: pcmpgtd %xmm11, %xmm13
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,0,2,2]
+; SSSE3-NEXT: pand %xmm12, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm12
+; SSSE3-NEXT: pand %xmm12, %xmm0
+; SSSE3-NEXT: pandn %xmm2, %xmm12
+; SSSE3-NEXT: por %xmm0, %xmm12
+; SSSE3-NEXT: movdqa %xmm10, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm11
+; SSSE3-NEXT: movdqa %xmm5, %xmm13
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm13
+; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
+; SSSE3-NEXT: pand %xmm11, %xmm14
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3]
+; SSSE3-NEXT: por %xmm14, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm10
+; SSSE3-NEXT: pandn %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm10, %xmm0
+; SSSE3-NEXT: packuswb %xmm12, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm10
+; SSSE3-NEXT: pxor %xmm4, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm11
+; SSSE3-NEXT: movdqa %xmm5, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2]
+; SSSE3-NEXT: pand %xmm11, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3]
+; SSSE3-NEXT: por %xmm10, %xmm11
+; SSSE3-NEXT: pand %xmm11, %xmm9
+; SSSE3-NEXT: pandn %xmm2, %xmm11
+; SSSE3-NEXT: por %xmm9, %xmm11
+; SSSE3-NEXT: movdqa %xmm8, %xmm9
+; SSSE3-NEXT: pxor %xmm4, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm10
+; SSSE3-NEXT: movdqa %xmm5, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,0,2,2]
+; SSSE3-NEXT: pand %xmm10, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3]
+; SSSE3-NEXT: por %xmm9, %xmm10
+; SSSE3-NEXT: pand %xmm10, %xmm8
+; SSSE3-NEXT: pandn %xmm2, %xmm10
+; SSSE3-NEXT: por %xmm8, %xmm10
+; SSSE3-NEXT: packuswb %xmm11, %xmm10
+; SSSE3-NEXT: packuswb %xmm10, %xmm0
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pxor %xmm4, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm9, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm8, %xmm9
+; SSSE3-NEXT: pand %xmm9, %xmm7
+; SSSE3-NEXT: pandn %xmm2, %xmm9
+; SSSE3-NEXT: por %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pxor %xmm4, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm10
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3]
+; SSSE3-NEXT: por %xmm11, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm6
+; SSSE3-NEXT: pandn %xmm2, %xmm7
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: packuswb %xmm9, %xmm7
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8
+; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2]
+; SSSE3-NEXT: pand %xmm8, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm3
+; SSSE3-NEXT: pandn %xmm2, %xmm8
+; SSSE3-NEXT: por %xmm3, %xmm8
; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pxor %xmm14, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm6, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
+; SSSE3-NEXT: pandn %xmm2, %xmm4
; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm13, %xmm1
-; SSSE3-NEXT: pxor %xmm14, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm13
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm13, %xmm3
-; SSSE3-NEXT: packuswb %xmm4, %xmm3
-; SSSE3-NEXT: packuswb %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm12, %xmm1
-; SSSE3-NEXT: pxor %xmm14, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm12
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm12, %xmm3
-; SSSE3-NEXT: movdqa %xmm11, %xmm1
-; SSSE3-NEXT: pxor %xmm14, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm11
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm11, %xmm1
-; SSSE3-NEXT: packuswb %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm10, %xmm3
-; SSSE3-NEXT: pxor %xmm14, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm10, %xmm4
-; SSSE3-NEXT: movdqa %xmm9, %xmm3
-; SSSE3-NEXT: pxor %xmm14, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm14, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm9
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm9, %xmm2
-; SSSE3-NEXT: packuswb %xmm4, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm1
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: packuswb %xmm8, %xmm4
+; SSSE3-NEXT: packuswb %xmm4, %xmm7
+; SSSE3-NEXT: packuswb %xmm7, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v16i64_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa 96(%rdi), %xmm8
-; SSE41-NEXT: movdqa 112(%rdi), %xmm9
-; SSE41-NEXT: movdqa 64(%rdi), %xmm10
-; SSE41-NEXT: movdqa 80(%rdi), %xmm11
-; SSE41-NEXT: movdqa (%rdi), %xmm2
+; SSE41-NEXT: movdqa 96(%rdi), %xmm2
+; SSE41-NEXT: movdqa 112(%rdi), %xmm4
+; SSE41-NEXT: movdqa 64(%rdi), %xmm7
+; SSE41-NEXT: movdqa 80(%rdi), %xmm8
+; SSE41-NEXT: movdqa (%rdi), %xmm11
; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: movdqa 32(%rdi), %xmm12
-; SSE41-NEXT: movdqa 48(%rdi), %xmm13
+; SSE41-NEXT: movdqa 32(%rdi), %xmm9
+; SSE41-NEXT: movdqa 48(%rdi), %xmm10
; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT: movdqa %xmm5, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm12
+; SSE41-NEXT: movdqa %xmm5, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm12
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12
+; SSE41-NEXT: movdqa %xmm11, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm5, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm5, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: por %xmm13, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packusdw %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm13, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1
+; SSE41-NEXT: packusdw %xmm12, %xmm1
+; SSE41-NEXT: movdqa %xmm10, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm2
-; SSE41-NEXT: movdqa %xmm12, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
+; SSE41-NEXT: movdqa %xmm5, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm12
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm12, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm11
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11
+; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5
-; SSE41-NEXT: packusdw %xmm2, %xmm5
-; SSE41-NEXT: packusdw %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm11, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
+; SSE41-NEXT: movdqa %xmm5, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm12
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm12, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm10
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10
+; SSE41-NEXT: packusdw %xmm11, %xmm10
+; SSE41-NEXT: packusdw %xmm10, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: movdqa %xmm5, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm9
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9
+; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
-; SSE41-NEXT: packusdw %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm5, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
+; SSE41-NEXT: packusdw %xmm9, %xmm8
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5
-; SSE41-NEXT: pxor %xmm8, %xmm6
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm5, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: packusdw %xmm5, %xmm3
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
+; SSE41-NEXT: pxor %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm5, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: packusdw %xmm7, %xmm3
+; SSE41-NEXT: packusdw %xmm3, %xmm8
+; SSE41-NEXT: packuswb %xmm8, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -4004,38 +4004,38 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) {
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm1
; SSE2-NEXT: movdqa 48(%rdi), %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pxor %xmm4, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm8
+; SSE2-NEXT: por %xmm0, %xmm8
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pxor %xmm4, %xmm7
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm6
-; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: packuswb %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: packuswb %xmm7, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v16i32_v16i8:
@@ -4044,38 +4044,38 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) {
; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
; SSSE3-NEXT: movdqa 32(%rdi), %xmm1
; SSSE3-NEXT: movdqa 48(%rdi), %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm7
; SSSE3-NEXT: pxor %xmm4, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm2, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm8
+; SSSE3-NEXT: por %xmm0, %xmm8
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pxor %xmm4, %xmm7
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm0
; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: packuswb %xmm8, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm2, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm5
+; SSSE3-NEXT: pandn %xmm3, %xmm7
+; SSSE3-NEXT: por %xmm5, %xmm7
; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: packuswb %xmm6, %xmm3
-; SSSE3-NEXT: packuswb %xmm3, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: packuswb %xmm7, %xmm2
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v16i32_v16i8:
@@ -4144,38 +4144,38 @@ define void @trunc_usat_v16i32_v16i8_store(ptr %p0, ptr %p1) {
; SSE2-NEXT: movdqa 16(%rdi), %xmm5
; SSE2-NEXT: movdqa 32(%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm5, %xmm7
; SSE2-NEXT: pxor %xmm3, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm8
+; SSE2-NEXT: por %xmm5, %xmm8
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pxor %xmm3, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm5
; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: packuswb %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm6
-; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: packuswb %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm6, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: packuswb %xmm7, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm5, (%rsi)
; SSE2-NEXT: retq
;
@@ -4185,38 +4185,38 @@ define void @trunc_usat_v16i32_v16i8_store(ptr %p0, ptr %p1) {
; SSSE3-NEXT: movdqa 16(%rdi), %xmm5
; SSSE3-NEXT: movdqa 32(%rdi), %xmm0
; SSSE3-NEXT: movdqa 48(%rdi), %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm5, %xmm7
; SSSE3-NEXT: pxor %xmm3, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm1, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm2, %xmm8
+; SSSE3-NEXT: por %xmm5, %xmm8
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pxor %xmm3, %xmm7
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm2, %xmm5
; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: packuswb %xmm1, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm4, %xmm6
+; SSSE3-NEXT: packuswb %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: movdqa %xmm1, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pandn %xmm2, %xmm7
+; SSSE3-NEXT: por %xmm4, %xmm7
; SSSE3-NEXT: pxor %xmm0, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: packuswb %xmm6, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pandn %xmm2, %xmm1
+; SSSE3-NEXT: por %xmm0, %xmm1
+; SSSE3-NEXT: packuswb %xmm7, %xmm1
+; SSSE3-NEXT: packuswb %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm5, (%rsi)
; SSSE3-NEXT: retq
;
@@ -4643,152 +4643,152 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) {
define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) {
; SSE2-LABEL: trunc_usat_v32i32_v32i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm11
-; SSE2-NEXT: movdqa 16(%rdi), %xmm12
-; SSE2-NEXT: movdqa 32(%rdi), %xmm9
-; SSE2-NEXT: movdqa 48(%rdi), %xmm10
-; SSE2-NEXT: movdqa 96(%rdi), %xmm0
-; SSE2-NEXT: movdqa 112(%rdi), %xmm2
-; SSE2-NEXT: movdqa 64(%rdi), %xmm5
-; SSE2-NEXT: movdqa 80(%rdi), %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
+; SSE2-NEXT: movdqa (%rdi), %xmm7
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa 32(%rdi), %xmm2
+; SSE2-NEXT: movdqa 48(%rdi), %xmm5
+; SSE2-NEXT: movdqa 96(%rdi), %xmm8
+; SSE2-NEXT: movdqa 112(%rdi), %xmm9
+; SSE2-NEXT: movdqa 64(%rdi), %xmm10
+; SSE2-NEXT: movdqa 80(%rdi), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: pxor %xmm6, %xmm11
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm3, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm12
+; SSE2-NEXT: por %xmm1, %xmm12
+; SSE2-NEXT: movdqa %xmm10, %xmm11
+; SSE2-NEXT: pxor %xmm6, %xmm11
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm11, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm10
+; SSE2-NEXT: pandn %xmm4, %xmm1
+; SSE2-NEXT: por %xmm10, %xmm1
+; SSE2-NEXT: packuswb %xmm12, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pxor %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm3, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm11
+; SSE2-NEXT: por %xmm9, %xmm11
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pxor %xmm6, %xmm9
+; SSE2-NEXT: movdqa %xmm3, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: pandn %xmm4, %xmm10
+; SSE2-NEXT: por %xmm8, %xmm10
+; SSE2-NEXT: packuswb %xmm11, %xmm10
+; SSE2-NEXT: packuswb %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: pxor %xmm6, %xmm8
+; SSE2-NEXT: movdqa %xmm3, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm9
+; SSE2-NEXT: por %xmm0, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pxor %xmm6, %xmm8
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: packuswb %xmm9, %xmm0
; SSE2-NEXT: movdqa %xmm5, %xmm7
; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: packuswb %xmm5, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: pxor %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm12
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm12, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm11, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm10
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm10, %xmm3
-; SSE2-NEXT: pxor %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm9
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: packuswb %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm5, %xmm8
+; SSE2-NEXT: pxor %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: packuswb %xmm8, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v32i32_v32i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm11
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm12
-; SSSE3-NEXT: movdqa 32(%rdi), %xmm9
-; SSSE3-NEXT: movdqa 48(%rdi), %xmm10
-; SSSE3-NEXT: movdqa 96(%rdi), %xmm0
-; SSSE3-NEXT: movdqa 112(%rdi), %xmm2
-; SSSE3-NEXT: movdqa 64(%rdi), %xmm5
-; SSSE3-NEXT: movdqa 80(%rdi), %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
+; SSSE3-NEXT: movdqa (%rdi), %xmm7
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm2
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm5
+; SSSE3-NEXT: movdqa 96(%rdi), %xmm8
+; SSSE3-NEXT: movdqa 112(%rdi), %xmm9
+; SSSE3-NEXT: movdqa 64(%rdi), %xmm10
+; SSSE3-NEXT: movdqa 80(%rdi), %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm7, %xmm1
-; SSSE3-NEXT: pxor %xmm6, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm11
+; SSSE3-NEXT: pxor %xmm6, %xmm11
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm3, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12
+; SSSE3-NEXT: pand %xmm12, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm12
+; SSSE3-NEXT: por %xmm1, %xmm12
+; SSSE3-NEXT: movdqa %xmm10, %xmm11
+; SSSE3-NEXT: pxor %xmm6, %xmm11
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm10
+; SSSE3-NEXT: pandn %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm10, %xmm1
+; SSSE3-NEXT: packuswb %xmm12, %xmm1
+; SSSE3-NEXT: movdqa %xmm9, %xmm10
+; SSSE3-NEXT: pxor %xmm6, %xmm10
+; SSSE3-NEXT: movdqa %xmm3, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
+; SSSE3-NEXT: pand %xmm11, %xmm9
+; SSSE3-NEXT: pandn %xmm4, %xmm11
+; SSSE3-NEXT: por %xmm9, %xmm11
+; SSSE3-NEXT: movdqa %xmm8, %xmm9
+; SSSE3-NEXT: pxor %xmm6, %xmm9
+; SSSE3-NEXT: movdqa %xmm3, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10
+; SSSE3-NEXT: pand %xmm10, %xmm8
+; SSSE3-NEXT: pandn %xmm4, %xmm10
+; SSSE3-NEXT: por %xmm8, %xmm10
+; SSSE3-NEXT: packuswb %xmm11, %xmm10
+; SSSE3-NEXT: packuswb %xmm10, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm8
+; SSSE3-NEXT: pxor %xmm6, %xmm8
+; SSSE3-NEXT: movdqa %xmm3, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT: pand %xmm9, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm9
+; SSSE3-NEXT: por %xmm0, %xmm9
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: pxor %xmm6, %xmm8
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm7
+; SSSE3-NEXT: pandn %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm7, %xmm0
+; SSSE3-NEXT: packuswb %xmm9, %xmm0
; SSSE3-NEXT: movdqa %xmm5, %xmm7
; SSSE3-NEXT: pxor %xmm6, %xmm7
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: packuswb %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm6, %xmm3
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: packuswb %xmm5, %xmm3
-; SSSE3-NEXT: packuswb %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm12, %xmm0
-; SSSE3-NEXT: pxor %xmm6, %xmm0
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm12
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm12, %xmm2
-; SSSE3-NEXT: movdqa %xmm11, %xmm3
-; SSSE3-NEXT: pxor %xmm6, %xmm3
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm11
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm11, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm10, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm10
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm10, %xmm3
-; SSSE3-NEXT: pxor %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm9
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm9, %xmm4
-; SSSE3-NEXT: packuswb %xmm3, %xmm4
-; SSSE3-NEXT: packuswb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: pandn %xmm4, %xmm8
+; SSSE3-NEXT: por %xmm5, %xmm8
+; SSSE3-NEXT: pxor %xmm2, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: packuswb %xmm8, %xmm3
+; SSSE3-NEXT: packuswb %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v32i32_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 113fb0985d2d8..9c0615b129c71 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -2148,8 +2148,8 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE2-NEXT: movdqa %xmm0, %xmm5
@@ -2161,17 +2161,17 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, 112(%rdi)
-; SSE2-NEXT: movdqa %xmm4, 96(%rdi)
+; SSE2-NEXT: movdqa %xmm8, 96(%rdi)
; SSE2-NEXT: movdqa %xmm6, 80(%rdi)
; SSE2-NEXT: movdqa %xmm7, 64(%rdi)
; SSE2-NEXT: movdqa %xmm0, 48(%rdi)
; SSE2-NEXT: movdqa %xmm5, 32(%rdi)
; SSE2-NEXT: movdqa %xmm3, 16(%rdi)
-; SSE2-NEXT: movdqa %xmm8, (%rdi)
+; SSE2-NEXT: movdqa %xmm4, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_32i8_to_32i32:
@@ -2180,8 +2180,8 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm3, %xmm8
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
@@ -2193,17 +2193,17 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSSE3-NEXT: movdqa %xmm1, %xmm8
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, 112(%rdi)
-; SSSE3-NEXT: movdqa %xmm4, 96(%rdi)
+; SSSE3-NEXT: movdqa %xmm8, 96(%rdi)
; SSSE3-NEXT: movdqa %xmm6, 80(%rdi)
; SSSE3-NEXT: movdqa %xmm7, 64(%rdi)
; SSSE3-NEXT: movdqa %xmm0, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm5, 32(%rdi)
; SSSE3-NEXT: movdqa %xmm3, 16(%rdi)
-; SSSE3-NEXT: movdqa %xmm8, (%rdi)
+; SSSE3-NEXT: movdqa %xmm4, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_32i8_to_32i32:
diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
index a1f2f14f3ffd2..7ddff59c26b0e 100644
--- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
+++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
@@ -66,11 +66,10 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1
; X64: # %bb.0: # %entry
; X64-NEXT: pushq %rbp
; X64-NEXT: movq %rsp, %rbp
-; X64-NEXT: pushq %r14
; X64-NEXT: pushq %rbx
; X64-NEXT: andq $-64, %rsp
; X64-NEXT: subq $64, %rsp
-; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movq %rdi, %rbx
; X64-NEXT: vmovaps 16(%rbp), %zmm8
; X64-NEXT: vp2intersectd %zmm1, %zmm0, %k0
; X64-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
@@ -104,16 +103,15 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1
; X64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; X64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; X64-NEXT: kmovw %k0, %edi
-; X64-NEXT: kmovw %k1, %ebx
+; X64-NEXT: kmovw %k1, %r8d
; X64-NEXT: addl %edi, %eax
; X64-NEXT: addl %ecx, %edx
-; X64-NEXT: addl %ebx, %eax
+; X64-NEXT: addl %r8d, %eax
; X64-NEXT: addl %esi, %eax
; X64-NEXT: addl %edx, %eax
-; X64-NEXT: movw %ax, (%r14)
-; X64-NEXT: leaq -16(%rbp), %rsp
+; X64-NEXT: movw %ax, (%rbx)
+; X64-NEXT: leaq -8(%rbp), %rsp
; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r14
; X64-NEXT: popq %rbp
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll
index fcabb8f461062..ee577cf93d5cf 100644
--- a/llvm/test/CodeGen/X86/vselect-minmax.ll
+++ b/llvm/test/CodeGen/X86/vselect-minmax.ll
@@ -4196,14 +4196,14 @@ define <16 x i32> @test117(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -4271,14 +4271,14 @@ define <16 x i32> @test118(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -4346,14 +4346,14 @@ define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -4421,14 +4421,14 @@ define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -4503,21 +4503,21 @@ define <8 x i64> @test121(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -4551,24 +4551,23 @@ define <8 x i64> @test121(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test121:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4
+; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE4-NEXT: movdqa %xmm5, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE4-NEXT: movdqa %xmm6, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm5, %xmm1
; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test121:
@@ -4624,21 +4623,21 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -4672,24 +4671,23 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test122:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4
+; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE4-NEXT: movdqa %xmm5, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE4-NEXT: movdqa %xmm6, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm5, %xmm1
; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test122:
@@ -4745,21 +4743,21 @@ define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -4793,10 +4791,9 @@ define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test123:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
@@ -4804,12 +4801,12 @@ define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm5, %xmm1
; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test123:
@@ -4865,21 +4862,21 @@ define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -4913,10 +4910,9 @@ define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test124:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
@@ -4924,12 +4920,12 @@ define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm5, %xmm1
; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test124:
@@ -4985,21 +4981,21 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -5033,39 +5029,36 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test125:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm6, %xmm9
-; SSE4-NEXT: movdqa %xmm5, %xmm10
-; SSE4-NEXT: movdqa %xmm0, %xmm5
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm6
+; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm0, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm10
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE4-NEXT: movdqa %xmm1, %xmm5
-; SSE4-NEXT: pxor %xmm7, %xmm5
-; SSE4-NEXT: movdqa %xmm10, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE4-NEXT: movdqa %xmm1, %xmm9
+; SSE4-NEXT: pxor %xmm8, %xmm9
+; SSE4-NEXT: movdqa %xmm5, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE4-NEXT: movdqa %xmm2, %xmm1
-; SSE4-NEXT: pxor %xmm7, %xmm1
-; SSE4-NEXT: movdqa %xmm9, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm1
+; SSE4-NEXT: movdqa %xmm6, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm3, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pxor %xmm8, %xmm7
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm7
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pxor %xmm7, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm9, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test125:
@@ -5135,21 +5128,21 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -5183,39 +5176,36 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test126:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm6, %xmm9
-; SSE4-NEXT: movdqa %xmm5, %xmm10
-; SSE4-NEXT: movdqa %xmm0, %xmm5
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm6
+; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm0, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm10
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE4-NEXT: movdqa %xmm1, %xmm5
-; SSE4-NEXT: pxor %xmm7, %xmm5
-; SSE4-NEXT: movdqa %xmm10, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE4-NEXT: movdqa %xmm1, %xmm9
+; SSE4-NEXT: pxor %xmm8, %xmm9
+; SSE4-NEXT: movdqa %xmm5, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE4-NEXT: movdqa %xmm2, %xmm1
-; SSE4-NEXT: pxor %xmm7, %xmm1
-; SSE4-NEXT: movdqa %xmm9, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm1
+; SSE4-NEXT: movdqa %xmm6, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm3, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pxor %xmm8, %xmm7
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm7
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pxor %xmm7, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm9, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test126:
@@ -5285,21 +5275,21 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -5333,38 +5323,35 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test127:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm6, %xmm9
-; SSE4-NEXT: movdqa %xmm5, %xmm10
-; SSE4-NEXT: movdqa %xmm4, %xmm5
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm7, %xmm4
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm5
-; SSE4-NEXT: movdqa %xmm10, %xmm4
-; SSE4-NEXT: pxor %xmm7, %xmm4
+; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm4, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE4-NEXT: movdqa %xmm5, %xmm9
+; SSE4-NEXT: pxor %xmm8, %xmm9
; SSE4-NEXT: movdqa %xmm1, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
-; SSE4-NEXT: movdqa %xmm9, %xmm1
-; SSE4-NEXT: pxor %xmm7, %xmm1
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT: movdqa %xmm6, %xmm1
+; SSE4-NEXT: pxor %xmm8, %xmm1
; SSE4-NEXT: movdqa %xmm2, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE4-NEXT: movdqa %xmm8, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pxor %xmm3, %xmm7
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
-; SSE4-NEXT: movapd %xmm5, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm9, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pxor %xmm3, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT: movapd %xmm4, %xmm0
+; SSE4-NEXT: movapd %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test127:
@@ -5434,21 +5421,21 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -5482,38 +5469,35 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test128:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm6, %xmm9
-; SSE4-NEXT: movdqa %xmm5, %xmm10
-; SSE4-NEXT: movdqa %xmm4, %xmm5
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm7, %xmm4
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm5
-; SSE4-NEXT: movdqa %xmm10, %xmm4
-; SSE4-NEXT: pxor %xmm7, %xmm4
+; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm4, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE4-NEXT: movdqa %xmm5, %xmm9
+; SSE4-NEXT: pxor %xmm8, %xmm9
; SSE4-NEXT: movdqa %xmm1, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
-; SSE4-NEXT: movdqa %xmm9, %xmm1
-; SSE4-NEXT: pxor %xmm7, %xmm1
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT: movdqa %xmm6, %xmm1
+; SSE4-NEXT: pxor %xmm8, %xmm1
; SSE4-NEXT: movdqa %xmm2, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE4-NEXT: movdqa %xmm8, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pxor %xmm3, %xmm7
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
-; SSE4-NEXT: movapd %xmm5, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm9, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pxor %xmm3, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT: movapd %xmm4, %xmm0
+; SSE4-NEXT: movapd %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test128:
@@ -6748,14 +6732,14 @@ define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -6823,14 +6807,14 @@ define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -6898,14 +6882,14 @@ define <16 x i32> @test151(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -6973,14 +6957,14 @@ define <16 x i32> @test152(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -7057,21 +7041,21 @@ define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -7105,10 +7089,9 @@ define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test153:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
@@ -7116,12 +7099,12 @@ define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm5, %xmm1
; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test153:
@@ -7177,21 +7160,21 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -7225,10 +7208,9 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test154:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
@@ -7236,12 +7218,12 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm5, %xmm1
; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test154:
@@ -7297,21 +7279,21 @@ define <8 x i64> @test155(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -7345,24 +7327,23 @@ define <8 x i64> @test155(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test155:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4
+; SSE4-NEXT: pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE4-NEXT: movdqa %xmm5, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE4-NEXT: movdqa %xmm6, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm5, %xmm1
; SSE4-NEXT: movapd %xmm6, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test155:
@@ -7418,21 +7399,21 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -7466,38 +7447,35 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test156:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm6, %xmm9
-; SSE4-NEXT: movdqa %xmm5, %xmm10
-; SSE4-NEXT: movdqa %xmm4, %xmm5
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm7, %xmm4
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm5
-; SSE4-NEXT: movdqa %xmm10, %xmm4
-; SSE4-NEXT: pxor %xmm7, %xmm4
+; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm4, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE4-NEXT: movdqa %xmm5, %xmm9
+; SSE4-NEXT: pxor %xmm8, %xmm9
; SSE4-NEXT: movdqa %xmm1, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
-; SSE4-NEXT: movdqa %xmm9, %xmm1
-; SSE4-NEXT: pxor %xmm7, %xmm1
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT: movdqa %xmm6, %xmm1
+; SSE4-NEXT: pxor %xmm8, %xmm1
; SSE4-NEXT: movdqa %xmm2, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE4-NEXT: movdqa %xmm8, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pxor %xmm3, %xmm7
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
-; SSE4-NEXT: movapd %xmm5, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm9, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pxor %xmm3, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT: movapd %xmm4, %xmm0
+; SSE4-NEXT: movapd %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test156:
@@ -7567,21 +7545,21 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -7615,39 +7593,36 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test159:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm6, %xmm9
-; SSE4-NEXT: movdqa %xmm5, %xmm10
-; SSE4-NEXT: movdqa %xmm0, %xmm5
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm6
+; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm0, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm10
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE4-NEXT: movdqa %xmm1, %xmm5
-; SSE4-NEXT: pxor %xmm7, %xmm5
-; SSE4-NEXT: movdqa %xmm10, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE4-NEXT: movdqa %xmm1, %xmm9
+; SSE4-NEXT: pxor %xmm8, %xmm9
+; SSE4-NEXT: movdqa %xmm5, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE4-NEXT: movdqa %xmm2, %xmm1
-; SSE4-NEXT: pxor %xmm7, %xmm1
-; SSE4-NEXT: movdqa %xmm9, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm1
+; SSE4-NEXT: movdqa %xmm6, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm3, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pxor %xmm8, %xmm7
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm7
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pxor %xmm7, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm9, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test159:
@@ -7717,21 +7692,21 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
@@ -7765,39 +7740,36 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
;
; SSE4-LABEL: test160:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqa %xmm7, %xmm8
-; SSE4-NEXT: movdqa %xmm6, %xmm9
-; SSE4-NEXT: movdqa %xmm5, %xmm10
-; SSE4-NEXT: movdqa %xmm0, %xmm5
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm6
+; SSE4-NEXT: movdqa %xmm0, %xmm9
+; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm0, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm10
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE4-NEXT: movdqa %xmm1, %xmm5
-; SSE4-NEXT: pxor %xmm7, %xmm5
-; SSE4-NEXT: movdqa %xmm10, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm10, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4
+; SSE4-NEXT: movdqa %xmm1, %xmm9
+; SSE4-NEXT: pxor %xmm8, %xmm9
+; SSE4-NEXT: movdqa %xmm5, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE4-NEXT: movdqa %xmm2, %xmm1
-; SSE4-NEXT: pxor %xmm7, %xmm1
-; SSE4-NEXT: movdqa %xmm9, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm1
+; SSE4-NEXT: movdqa %xmm6, %xmm0
+; SSE4-NEXT: pxor %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm3, %xmm0
-; SSE4-NEXT: pxor %xmm7, %xmm0
-; SSE4-NEXT: pxor %xmm8, %xmm7
-; SSE4-NEXT: pcmpgtq %xmm0, %xmm7
-; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT: pxor %xmm8, %xmm0
+; SSE4-NEXT: pxor %xmm7, %xmm8
+; SSE4-NEXT: pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE4-NEXT: movapd %xmm4, %xmm0
-; SSE4-NEXT: movapd %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm9, %xmm2
-; SSE4-NEXT: movapd %xmm8, %xmm3
+; SSE4-NEXT: movapd %xmm5, %xmm1
+; SSE4-NEXT: movapd %xmm6, %xmm2
+; SSE4-NEXT: movapd %xmm7, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test160:
@@ -10288,53 +10260,53 @@ entry:
define <8 x i64> @concat_smin_smax(<4 x i64> %a0, <4 x i64> %a1) {
; SSE2-LABEL: concat_smin_smax:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm11, %xmm6
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: pxor %xmm11, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm7, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm8, %xmm4
-; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm0, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm11, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm12
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm7, %xmm9
+; SSE2-NEXT: pxor %xmm3, %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm7, %xmm12
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm12, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pand %xmm5, %xmm7
+; SSE2-NEXT: por %xmm11, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: pand %xmm5, %xmm11
; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm7, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: por %xmm11, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE2-NEXT: pand %xmm10, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm8
-; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: por %xmm8, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
-; SSE2-NEXT: pand %xmm13, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2]
+; SSE2-NEXT: pand %xmm12, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm4
@@ -10346,7 +10318,7 @@ define <8 x i64> @concat_smin_smax(<4 x i64> %a0, <4 x i64> %a1) {
;
; SSE4-LABEL: concat_smin_smax:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm8
+; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm0, %xmm5
; SSE4-NEXT: pcmpgtq %xmm2, %xmm5
; SSE4-NEXT: movdqa %xmm0, %xmm6
@@ -10354,15 +10326,15 @@ define <8 x i64> @concat_smin_smax(<4 x i64> %a0, <4 x i64> %a1) {
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: movdqa %xmm1, %xmm7
; SSE4-NEXT: pcmpgtq %xmm3, %xmm7
-; SSE4-NEXT: movdqa %xmm1, %xmm4
+; SSE4-NEXT: movdqa %xmm1, %xmm8
; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm4
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8
; SSE4-NEXT: movdqa %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE4-NEXT: movapd %xmm6, %xmm0
-; SSE4-NEXT: movapd %xmm4, %xmm1
+; SSE4-NEXT: movapd %xmm8, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: concat_smin_smax:
diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll
index 5e88143134cb0..f31b986ceb262 100644
--- a/llvm/test/CodeGen/X86/vselect-packss.ll
+++ b/llvm/test/CodeGen/X86/vselect-packss.ll
@@ -264,13 +264,13 @@ define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT: vpcmpeqq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
+; AVX1-NEXT: vpcmpeqq %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpcmpeqq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackssdw %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm6, %xmm3
diff --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
index 0194c1aecc558..9d55df0602f99 100644
--- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll
+++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
@@ -109,25 +109,25 @@ define void @CmovInHotPath(i32 %n, i32 %a, i32 %b, ptr nocapture %c, ptr nocaptu
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: jle .LBB0_5
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .LBB0_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl (%rcx,%rdi,4), %eax
-; CHECK-NEXT: leal 1(%rax), %r9d
-; CHECK-NEXT: imull %esi, %eax
-; CHECK-NEXT: movl $10, %r10d
-; CHECK-NEXT: cmpl %edx, %eax
+; CHECK-NEXT: movl (%rcx,%rdi,4), %r10d
+; CHECK-NEXT: leal 1(%r10), %r8d
+; CHECK-NEXT: imull %esi, %r10d
+; CHECK-NEXT: movl $10, %r9d
+; CHECK-NEXT: cmpl %edx, %r10d
; CHECK-NEXT: jg .LBB0_4
; CHECK-NEXT: # %bb.3: # %for.body
; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: movl %r9d, %r10d
+; CHECK-NEXT: movl %r8d, %r9d
; CHECK-NEXT: .LBB0_4: # %for.body
; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: imull %r9d, %r10d
-; CHECK-NEXT: movl %r10d, (%rcx,%rdi,4)
+; CHECK-NEXT: imull %r8d, %r9d
+; CHECK-NEXT: movl %r9d, (%rcx,%rdi,4)
; CHECK-NEXT: addq $1, %rdi
-; CHECK-NEXT: cmpq %rdi, %r8
+; CHECK-NEXT: cmpq %rdi, %rax
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: .LBB0_5: # %for.cond.cleanup
; CHECK-NEXT: retq
@@ -137,25 +137,25 @@ define void @CmovInHotPath(i32 %n, i32 %a, i32 %b, ptr nocapture %c, ptr nocaptu
; CHECK-FORCEALL-NEXT: testl %edi, %edi
; CHECK-FORCEALL-NEXT: jle .LBB0_5
; CHECK-FORCEALL-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-FORCEALL-NEXT: movl %edi, %r8d
+; CHECK-FORCEALL-NEXT: movl %edi, %eax
; CHECK-FORCEALL-NEXT: xorl %edi, %edi
; CHECK-FORCEALL-NEXT: .LBB0_2: # %for.body
; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-FORCEALL-NEXT: movl (%rcx,%rdi,4), %eax
-; CHECK-FORCEALL-NEXT: leal 1(%rax), %r9d
-; CHECK-FORCEALL-NEXT: imull %esi, %eax
-; CHECK-FORCEALL-NEXT: movl $10, %r10d
-; CHECK-FORCEALL-NEXT: cmpl %edx, %eax
+; CHECK-FORCEALL-NEXT: movl (%rcx,%rdi,4), %r10d
+; CHECK-FORCEALL-NEXT: leal 1(%r10), %r8d
+; CHECK-FORCEALL-NEXT: imull %esi, %r10d
+; CHECK-FORCEALL-NEXT: movl $10, %r9d
+; CHECK-FORCEALL-NEXT: cmpl %edx, %r10d
; CHECK-FORCEALL-NEXT: jg .LBB0_4
; CHECK-FORCEALL-NEXT: # %bb.3: # %for.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB0_2 Depth=1
-; CHECK-FORCEALL-NEXT: movl %r9d, %r10d
+; CHECK-FORCEALL-NEXT: movl %r8d, %r9d
; CHECK-FORCEALL-NEXT: .LBB0_4: # %for.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB0_2 Depth=1
-; CHECK-FORCEALL-NEXT: imull %r9d, %r10d
-; CHECK-FORCEALL-NEXT: movl %r10d, (%rcx,%rdi,4)
+; CHECK-FORCEALL-NEXT: imull %r8d, %r9d
+; CHECK-FORCEALL-NEXT: movl %r9d, (%rcx,%rdi,4)
; CHECK-FORCEALL-NEXT: addq $1, %rdi
-; CHECK-FORCEALL-NEXT: cmpq %rdi, %r8
+; CHECK-FORCEALL-NEXT: cmpq %rdi, %rax
; CHECK-FORCEALL-NEXT: jne .LBB0_2
; CHECK-FORCEALL-NEXT: .LBB0_5: # %for.cond.cleanup
; CHECK-FORCEALL-NEXT: retq
@@ -192,23 +192,23 @@ define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, ptr nocapture %c, ptr noca
; CHECK-NEXT: jle .LBB1_3
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: movl %edx, %r9d
-; CHECK-NEXT: movl %edi, %r10d
-; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: movl %edi, %edi
+; CHECK-NEXT: xorl %r10d, %r10d
; CHECK-NEXT: movl $10, %r11d
; CHECK-NEXT: .LBB1_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl (%rcx,%rdi,4), %eax
+; CHECK-NEXT: movl (%rcx,%r10,4), %eax
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: imull %esi, %edx
; CHECK-NEXT: cmpl %r9d, %edx
; CHECK-NEXT: cmovgl %r11d, %eax
-; CHECK-NEXT: movl %eax, (%rcx,%rdi,4)
-; CHECK-NEXT: movl (%r8,%rdi,4), %eax
+; CHECK-NEXT: movl %eax, (%rcx,%r10,4)
+; CHECK-NEXT: movl (%r8,%r10,4), %eax
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %r9d
-; CHECK-NEXT: movl %eax, (%r8,%rdi,4)
-; CHECK-NEXT: addq $1, %rdi
-; CHECK-NEXT: cmpq %rdi, %r10
+; CHECK-NEXT: movl %eax, (%r8,%r10,4)
+; CHECK-NEXT: addq $1, %r10
+; CHECK-NEXT: cmpq %r10, %rdi
; CHECK-NEXT: jne .LBB1_2
; CHECK-NEXT: .LBB1_3: # %for.cond.cleanup
; CHECK-NEXT: retq
@@ -219,28 +219,28 @@ define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, ptr nocapture %c, ptr noca
; CHECK-FORCEALL-NEXT: jle .LBB1_5
; CHECK-FORCEALL-NEXT: # %bb.1: # %for.body.preheader
; CHECK-FORCEALL-NEXT: movl %edx, %r9d
-; CHECK-FORCEALL-NEXT: movl %edi, %r10d
-; CHECK-FORCEALL-NEXT: xorl %edi, %edi
+; CHECK-FORCEALL-NEXT: movl %edi, %edi
+; CHECK-FORCEALL-NEXT: xorl %r10d, %r10d
; CHECK-FORCEALL-NEXT: .LBB1_2: # %for.body
; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-FORCEALL-NEXT: movl (%rcx,%rdi,4), %r11d
-; CHECK-FORCEALL-NEXT: movl %r11d, %eax
-; CHECK-FORCEALL-NEXT: imull %esi, %eax
+; CHECK-FORCEALL-NEXT: movl (%rcx,%r10,4), %eax
+; CHECK-FORCEALL-NEXT: movl %eax, %r11d
+; CHECK-FORCEALL-NEXT: imull %esi, %r11d
; CHECK-FORCEALL-NEXT: movl $10, %edx
-; CHECK-FORCEALL-NEXT: cmpl %r9d, %eax
+; CHECK-FORCEALL-NEXT: cmpl %r9d, %r11d
; CHECK-FORCEALL-NEXT: jg .LBB1_4
; CHECK-FORCEALL-NEXT: # %bb.3: # %for.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-FORCEALL-NEXT: movl %r11d, %edx
+; CHECK-FORCEALL-NEXT: movl %eax, %edx
; CHECK-FORCEALL-NEXT: .LBB1_4: # %for.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-FORCEALL-NEXT: movl %edx, (%rcx,%rdi,4)
-; CHECK-FORCEALL-NEXT: movl (%r8,%rdi,4), %eax
+; CHECK-FORCEALL-NEXT: movl %edx, (%rcx,%r10,4)
+; CHECK-FORCEALL-NEXT: movl (%r8,%r10,4), %eax
; CHECK-FORCEALL-NEXT: cltd
; CHECK-FORCEALL-NEXT: idivl %r9d
-; CHECK-FORCEALL-NEXT: movl %eax, (%r8,%rdi,4)
-; CHECK-FORCEALL-NEXT: addq $1, %rdi
-; CHECK-FORCEALL-NEXT: cmpq %rdi, %r10
+; CHECK-FORCEALL-NEXT: movl %eax, (%r8,%r10,4)
+; CHECK-FORCEALL-NEXT: addq $1, %r10
+; CHECK-FORCEALL-NEXT: cmpq %r10, %rdi
; CHECK-FORCEALL-NEXT: jne .LBB1_2
; CHECK-FORCEALL-NEXT: .LBB1_5: # %for.cond.cleanup
; CHECK-FORCEALL-NEXT: retq
@@ -279,15 +279,15 @@ define i32 @MaxIndex(i32 %n, ptr nocapture readonly %a) #0 {
; CHECK-NEXT: cmpl $2, %edi
; CHECK-NEXT: jl .LBB2_5
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: movl $1, %edx
; CHECK-NEXT: .LBB2_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl (%rsi,%rdx,4), %r9d
-; CHECK-NEXT: movslq %edi, %rcx
+; CHECK-NEXT: movl (%rsi,%rdx,4), %r8d
+; CHECK-NEXT: movslq %edi, %r9
; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: cmpl (%rsi,%rcx,4), %r9d
+; CHECK-NEXT: cmpl (%rsi,%r9,4), %r8d
; CHECK-NEXT: jg .LBB2_4
; CHECK-NEXT: # %bb.3: # %for.body
; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
@@ -296,7 +296,7 @@ define i32 @MaxIndex(i32 %n, ptr nocapture readonly %a) #0 {
; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: addq $1, %rdx
; CHECK-NEXT: movl %eax, %edi
-; CHECK-NEXT: cmpq %rdx, %r8
+; CHECK-NEXT: cmpq %rdx, %rcx
; CHECK-NEXT: jne .LBB2_2
; CHECK-NEXT: .LBB2_5: # %for.cond.cleanup
; CHECK-NEXT: retq
@@ -307,15 +307,15 @@ define i32 @MaxIndex(i32 %n, ptr nocapture readonly %a) #0 {
; CHECK-FORCEALL-NEXT: cmpl $2, %edi
; CHECK-FORCEALL-NEXT: jl .LBB2_5
; CHECK-FORCEALL-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-FORCEALL-NEXT: movl %edi, %r8d
+; CHECK-FORCEALL-NEXT: movl %edi, %ecx
; CHECK-FORCEALL-NEXT: xorl %edi, %edi
; CHECK-FORCEALL-NEXT: movl $1, %edx
; CHECK-FORCEALL-NEXT: .LBB2_2: # %for.body
; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %r9d
-; CHECK-FORCEALL-NEXT: movslq %edi, %rcx
+; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %r8d
+; CHECK-FORCEALL-NEXT: movslq %edi, %r9
; CHECK-FORCEALL-NEXT: movl %edx, %eax
-; CHECK-FORCEALL-NEXT: cmpl (%rsi,%rcx,4), %r9d
+; CHECK-FORCEALL-NEXT: cmpl (%rsi,%r9,4), %r8d
; CHECK-FORCEALL-NEXT: jg .LBB2_4
; CHECK-FORCEALL-NEXT: # %bb.3: # %for.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB2_2 Depth=1
@@ -324,7 +324,7 @@ define i32 @MaxIndex(i32 %n, ptr nocapture readonly %a) #0 {
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB2_2 Depth=1
; CHECK-FORCEALL-NEXT: addq $1, %rdx
; CHECK-FORCEALL-NEXT: movl %eax, %edi
-; CHECK-FORCEALL-NEXT: cmpq %rdx, %r8
+; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx
; CHECK-FORCEALL-NEXT: jne .LBB2_2
; CHECK-FORCEALL-NEXT: .LBB2_5: # %for.cond.cleanup
; CHECK-FORCEALL-NEXT: retq
@@ -364,15 +364,15 @@ define i32 @MaxIndex_unpredictable(i32 %n, ptr nocapture readonly %a) #0 {
; CHECK-NEXT: cmpl $2, %edi
; CHECK-NEXT: jl .LBB3_5
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: movl $1, %edx
; CHECK-NEXT: .LBB3_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl (%rsi,%rdx,4), %r9d
-; CHECK-NEXT: movslq %edi, %rcx
+; CHECK-NEXT: movl (%rsi,%rdx,4), %r8d
+; CHECK-NEXT: movslq %edi, %r9
; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: cmpl (%rsi,%rcx,4), %r9d
+; CHECK-NEXT: cmpl (%rsi,%r9,4), %r8d
; CHECK-NEXT: jg .LBB3_4
; CHECK-NEXT: # %bb.3: # %for.body
; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1
@@ -381,7 +381,7 @@ define i32 @MaxIndex_unpredictable(i32 %n, ptr nocapture readonly %a) #0 {
; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: addq $1, %rdx
; CHECK-NEXT: movl %eax, %edi
-; CHECK-NEXT: cmpq %rdx, %r8
+; CHECK-NEXT: cmpq %rdx, %rcx
; CHECK-NEXT: jne .LBB3_2
; CHECK-NEXT: .LBB3_5: # %for.cond.cleanup
; CHECK-NEXT: retq
@@ -392,15 +392,15 @@ define i32 @MaxIndex_unpredictable(i32 %n, ptr nocapture readonly %a) #0 {
; CHECK-FORCEALL-NEXT: cmpl $2, %edi
; CHECK-FORCEALL-NEXT: jl .LBB3_5
; CHECK-FORCEALL-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-FORCEALL-NEXT: movl %edi, %r8d
+; CHECK-FORCEALL-NEXT: movl %edi, %ecx
; CHECK-FORCEALL-NEXT: xorl %edi, %edi
; CHECK-FORCEALL-NEXT: movl $1, %edx
; CHECK-FORCEALL-NEXT: .LBB3_2: # %for.body
; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %r9d
-; CHECK-FORCEALL-NEXT: movslq %edi, %rcx
+; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %r8d
+; CHECK-FORCEALL-NEXT: movslq %edi, %r9
; CHECK-FORCEALL-NEXT: movl %edx, %eax
-; CHECK-FORCEALL-NEXT: cmpl (%rsi,%rcx,4), %r9d
+; CHECK-FORCEALL-NEXT: cmpl (%rsi,%r9,4), %r8d
; CHECK-FORCEALL-NEXT: jg .LBB3_4
; CHECK-FORCEALL-NEXT: # %bb.3: # %for.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB3_2 Depth=1
@@ -409,7 +409,7 @@ define i32 @MaxIndex_unpredictable(i32 %n, ptr nocapture readonly %a) #0 {
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB3_2 Depth=1
; CHECK-FORCEALL-NEXT: addq $1, %rdx
; CHECK-FORCEALL-NEXT: movl %eax, %edi
-; CHECK-FORCEALL-NEXT: cmpq %rdx, %r8
+; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx
; CHECK-FORCEALL-NEXT: jne .LBB3_2
; CHECK-FORCEALL-NEXT: .LBB3_5: # %for.cond.cleanup
; CHECK-FORCEALL-NEXT: retq
@@ -463,29 +463,29 @@ define i32 @MaxValue(i32 %n, ptr nocapture readonly %a) #0 {
;
; CHECK-FORCEALL-LABEL: MaxValue:
; CHECK-FORCEALL: # %bb.0: # %entry
-; CHECK-FORCEALL-NEXT: movl (%rsi), %ecx
+; CHECK-FORCEALL-NEXT: movl (%rsi), %r8d
; CHECK-FORCEALL-NEXT: cmpl $2, %edi
; CHECK-FORCEALL-NEXT: jge .LBB4_3
; CHECK-FORCEALL-NEXT: # %bb.1:
-; CHECK-FORCEALL-NEXT: movl %ecx, %eax
+; CHECK-FORCEALL-NEXT: movl %r8d, %eax
; CHECK-FORCEALL-NEXT: .LBB4_2: # %for.cond.cleanup
; CHECK-FORCEALL-NEXT: retq
; CHECK-FORCEALL-NEXT: .LBB4_3: # %for.body.preheader
-; CHECK-FORCEALL-NEXT: movl %edi, %edi
+; CHECK-FORCEALL-NEXT: movl %edi, %ecx
; CHECK-FORCEALL-NEXT: movl $1, %edx
; CHECK-FORCEALL-NEXT: .LBB4_4: # %for.body
; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %eax
-; CHECK-FORCEALL-NEXT: cmpl %ecx, %eax
+; CHECK-FORCEALL-NEXT: cmpl %r8d, %eax
; CHECK-FORCEALL-NEXT: jg .LBB4_6
; CHECK-FORCEALL-NEXT: # %bb.5: # %for.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB4_4 Depth=1
-; CHECK-FORCEALL-NEXT: movl %ecx, %eax
+; CHECK-FORCEALL-NEXT: movl %r8d, %eax
; CHECK-FORCEALL-NEXT: .LBB4_6: # %for.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB4_4 Depth=1
; CHECK-FORCEALL-NEXT: addq $1, %rdx
-; CHECK-FORCEALL-NEXT: movl %eax, %ecx
-; CHECK-FORCEALL-NEXT: cmpq %rdx, %rdi
+; CHECK-FORCEALL-NEXT: movl %eax, %r8d
+; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx
; CHECK-FORCEALL-NEXT: je .LBB4_2
; CHECK-FORCEALL-NEXT: jmp .LBB4_4
entry:
@@ -611,27 +611,27 @@ define void @Transform(ptr%arr, ptr%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB6_5
; CHECK-NEXT: # %bb.1: # %while.body.preheader
-; CHECK-NEXT: movl %edx, %r8d
+; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: .LBB6_2: # %while.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movslq %esi, %rsi
; CHECK-NEXT: movl (%rdi,%rsi,4), %eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %r8d
+; CHECK-NEXT: divl %ecx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: movl $11, %eax
-; CHECK-NEXT: movl %r8d, %ecx
-; CHECK-NEXT: cmpl %r8d, %edx
+; CHECK-NEXT: movl %ecx, %r8d
+; CHECK-NEXT: cmpl %ecx, %edx
; CHECK-NEXT: ja .LBB6_4
; CHECK-NEXT: # %bb.3: # %while.body
; CHECK-NEXT: # in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT: movl $22, %eax
-; CHECK-NEXT: movl $22, %ecx
+; CHECK-NEXT: movl $22, %r8d
; CHECK-NEXT: .LBB6_4: # %while.body
; CHECK-NEXT: # in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: divl %r8d
; CHECK-NEXT: movl %edx, (%rdi,%rsi,4)
; CHECK-NEXT: addl $1, %esi
; CHECK-NEXT: cmpl %r9d, %esi
@@ -645,27 +645,27 @@ define void @Transform(ptr%arr, ptr%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 {
; CHECK-FORCEALL-NEXT: testb %al, %al
; CHECK-FORCEALL-NEXT: jne .LBB6_5
; CHECK-FORCEALL-NEXT: # %bb.1: # %while.body.preheader
-; CHECK-FORCEALL-NEXT: movl %edx, %r8d
+; CHECK-FORCEALL-NEXT: movl %edx, %ecx
; CHECK-FORCEALL-NEXT: xorl %esi, %esi
; CHECK-FORCEALL-NEXT: .LBB6_2: # %while.body
; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-FORCEALL-NEXT: movslq %esi, %rsi
; CHECK-FORCEALL-NEXT: movl (%rdi,%rsi,4), %eax
; CHECK-FORCEALL-NEXT: xorl %edx, %edx
-; CHECK-FORCEALL-NEXT: divl %r8d
+; CHECK-FORCEALL-NEXT: divl %ecx
; CHECK-FORCEALL-NEXT: movl %eax, %edx
; CHECK-FORCEALL-NEXT: movl $11, %eax
-; CHECK-FORCEALL-NEXT: movl %r8d, %ecx
-; CHECK-FORCEALL-NEXT: cmpl %r8d, %edx
+; CHECK-FORCEALL-NEXT: movl %ecx, %r8d
+; CHECK-FORCEALL-NEXT: cmpl %ecx, %edx
; CHECK-FORCEALL-NEXT: ja .LBB6_4
; CHECK-FORCEALL-NEXT: # %bb.3: # %while.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB6_2 Depth=1
; CHECK-FORCEALL-NEXT: movl $22, %eax
-; CHECK-FORCEALL-NEXT: movl $22, %ecx
+; CHECK-FORCEALL-NEXT: movl $22, %r8d
; CHECK-FORCEALL-NEXT: .LBB6_4: # %while.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB6_2 Depth=1
; CHECK-FORCEALL-NEXT: xorl %edx, %edx
-; CHECK-FORCEALL-NEXT: divl %ecx
+; CHECK-FORCEALL-NEXT: divl %r8d
; CHECK-FORCEALL-NEXT: movl %edx, (%rdi,%rsi,4)
; CHECK-FORCEALL-NEXT: addl $1, %esi
; CHECK-FORCEALL-NEXT: cmpl %r9d, %esi
@@ -762,12 +762,12 @@ define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, ptr %y.ptr) #0
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: ja .LBB9_2
; CHECK-NEXT: # %bb.1: # %entry
-; CHECK-NEXT: movl (%rcx), %r8d
+; CHECK-NEXT: movl (%rcx), %edx
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movl %esi, %r8d
; CHECK-NEXT: .LBB9_2: # %entry
-; CHECK-NEXT: addl %r8d, %eax
; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: addl %r8d, %eax
; CHECK-NEXT: retq
;
; CHECK-FORCEALL-LABEL: test_cmov_memoperand_in_group:
@@ -777,12 +777,12 @@ define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, ptr %y.ptr) #0
; CHECK-FORCEALL-NEXT: cmpl %esi, %edi
; CHECK-FORCEALL-NEXT: ja .LBB9_2
; CHECK-FORCEALL-NEXT: # %bb.1: # %entry
-; CHECK-FORCEALL-NEXT: movl (%rcx), %r8d
+; CHECK-FORCEALL-NEXT: movl (%rcx), %edx
; CHECK-FORCEALL-NEXT: movl %edi, %eax
-; CHECK-FORCEALL-NEXT: movl %esi, %edx
+; CHECK-FORCEALL-NEXT: movl %esi, %r8d
; CHECK-FORCEALL-NEXT: .LBB9_2: # %entry
-; CHECK-FORCEALL-NEXT: addl %r8d, %eax
; CHECK-FORCEALL-NEXT: addl %edx, %eax
+; CHECK-FORCEALL-NEXT: addl %r8d, %eax
; CHECK-FORCEALL-NEXT: retq
entry:
%cond = icmp ugt i32 %a, %b
@@ -804,12 +804,12 @@ define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, ptr %y.ptr) #
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: jbe .LBB10_2
; CHECK-NEXT: # %bb.1: # %entry
-; CHECK-NEXT: movl (%rcx), %r8d
+; CHECK-NEXT: movl (%rcx), %edx
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movl %esi, %r8d
; CHECK-NEXT: .LBB10_2: # %entry
-; CHECK-NEXT: addl %r8d, %eax
; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: addl %r8d, %eax
; CHECK-NEXT: retq
;
; CHECK-FORCEALL-LABEL: test_cmov_memoperand_in_group2:
@@ -819,12 +819,12 @@ define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, ptr %y.ptr) #
; CHECK-FORCEALL-NEXT: cmpl %esi, %edi
; CHECK-FORCEALL-NEXT: jbe .LBB10_2
; CHECK-FORCEALL-NEXT: # %bb.1: # %entry
-; CHECK-FORCEALL-NEXT: movl (%rcx), %r8d
+; CHECK-FORCEALL-NEXT: movl (%rcx), %edx
; CHECK-FORCEALL-NEXT: movl %edi, %eax
-; CHECK-FORCEALL-NEXT: movl %esi, %edx
+; CHECK-FORCEALL-NEXT: movl %esi, %r8d
; CHECK-FORCEALL-NEXT: .LBB10_2: # %entry
-; CHECK-FORCEALL-NEXT: addl %r8d, %eax
; CHECK-FORCEALL-NEXT: addl %edx, %eax
+; CHECK-FORCEALL-NEXT: addl %r8d, %eax
; CHECK-FORCEALL-NEXT: retq
entry:
%cond = icmp ugt i32 %a, %b
@@ -969,32 +969,32 @@ entry:
define void @test_memoperand_loop(i32 %data) #0 {
; CHECK-LABEL: test_memoperand_loop:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq begin@GOTPCREL(%rip), %r8
-; CHECK-NEXT: movq (%r8), %rax
-; CHECK-NEXT: movq end@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: movq (%rcx), %rdx
+; CHECK-NEXT: movq begin@GOTPCREL(%rip), %rax
+; CHECK-NEXT: movq (%rax), %rcx
+; CHECK-NEXT: movq end@GOTPCREL(%rip), %rdx
+; CHECK-NEXT: movq (%rdx), %rdx
; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: movq %rcx, %r8
; CHECK-NEXT: .LBB15_1: # %loop.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: addq $8, %rcx
-; CHECK-NEXT: cmpq %rdx, %rcx
+; CHECK-NEXT: addq $8, %r8
+; CHECK-NEXT: cmpq %rdx, %r8
; CHECK-NEXT: ja .LBB15_3
; CHECK-NEXT: # %bb.2: # %loop.body
; CHECK-NEXT: # in Loop: Header=BB15_1 Depth=1
-; CHECK-NEXT: movq (%r8), %rcx
+; CHECK-NEXT: movq (%rax), %r8
; CHECK-NEXT: .LBB15_3: # %loop.body
; CHECK-NEXT: # in Loop: Header=BB15_1 Depth=1
-; CHECK-NEXT: movl %edi, (%rcx)
-; CHECK-NEXT: addq $8, %rcx
-; CHECK-NEXT: cmpq %rdx, %rcx
+; CHECK-NEXT: movl %edi, (%r8)
+; CHECK-NEXT: addq $8, %r8
+; CHECK-NEXT: cmpq %rdx, %r8
; CHECK-NEXT: ja .LBB15_5
; CHECK-NEXT: # %bb.4: # %loop.body
; CHECK-NEXT: # in Loop: Header=BB15_1 Depth=1
-; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: movq %rcx, %r8
; CHECK-NEXT: .LBB15_5: # %loop.body
; CHECK-NEXT: # in Loop: Header=BB15_1 Depth=1
-; CHECK-NEXT: movl %edi, (%rcx)
+; CHECK-NEXT: movl %edi, (%r8)
; CHECK-NEXT: addl $1, %esi
; CHECK-NEXT: cmpl $1024, %esi # imm = 0x400
; CHECK-NEXT: jl .LBB15_1
@@ -1003,32 +1003,32 @@ define void @test_memoperand_loop(i32 %data) #0 {
;
; CHECK-FORCEALL-LABEL: test_memoperand_loop:
; CHECK-FORCEALL: # %bb.0: # %entry
-; CHECK-FORCEALL-NEXT: movq begin@GOTPCREL(%rip), %r8
-; CHECK-FORCEALL-NEXT: movq (%r8), %rax
-; CHECK-FORCEALL-NEXT: movq end@GOTPCREL(%rip), %rcx
-; CHECK-FORCEALL-NEXT: movq (%rcx), %rdx
+; CHECK-FORCEALL-NEXT: movq begin@GOTPCREL(%rip), %rax
+; CHECK-FORCEALL-NEXT: movq (%rax), %rcx
+; CHECK-FORCEALL-NEXT: movq end@GOTPCREL(%rip), %rdx
+; CHECK-FORCEALL-NEXT: movq (%rdx), %rdx
; CHECK-FORCEALL-NEXT: xorl %esi, %esi
-; CHECK-FORCEALL-NEXT: movq %rax, %rcx
+; CHECK-FORCEALL-NEXT: movq %rcx, %r8
; CHECK-FORCEALL-NEXT: .LBB15_1: # %loop.body
; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-FORCEALL-NEXT: addq $8, %rcx
-; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx
+; CHECK-FORCEALL-NEXT: addq $8, %r8
+; CHECK-FORCEALL-NEXT: cmpq %rdx, %r8
; CHECK-FORCEALL-NEXT: ja .LBB15_3
; CHECK-FORCEALL-NEXT: # %bb.2: # %loop.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB15_1 Depth=1
-; CHECK-FORCEALL-NEXT: movq (%r8), %rcx
+; CHECK-FORCEALL-NEXT: movq (%rax), %r8
; CHECK-FORCEALL-NEXT: .LBB15_3: # %loop.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB15_1 Depth=1
-; CHECK-FORCEALL-NEXT: movl %edi, (%rcx)
-; CHECK-FORCEALL-NEXT: addq $8, %rcx
-; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx
+; CHECK-FORCEALL-NEXT: movl %edi, (%r8)
+; CHECK-FORCEALL-NEXT: addq $8, %r8
+; CHECK-FORCEALL-NEXT: cmpq %rdx, %r8
; CHECK-FORCEALL-NEXT: ja .LBB15_5
; CHECK-FORCEALL-NEXT: # %bb.4: # %loop.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB15_1 Depth=1
-; CHECK-FORCEALL-NEXT: movq %rax, %rcx
+; CHECK-FORCEALL-NEXT: movq %rcx, %r8
; CHECK-FORCEALL-NEXT: .LBB15_5: # %loop.body
; CHECK-FORCEALL-NEXT: # in Loop: Header=BB15_1 Depth=1
-; CHECK-FORCEALL-NEXT: movl %edi, (%rcx)
+; CHECK-FORCEALL-NEXT: movl %edi, (%r8)
; CHECK-FORCEALL-NEXT: addl $1, %esi
; CHECK-FORCEALL-NEXT: cmpl $1024, %esi # imm = 0x400
; CHECK-FORCEALL-NEXT: jl .LBB15_1
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 4518e4b270fe9..b7a2daf0615e5 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -249,33 +249,33 @@ define void @store_factori64_4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64>
define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, ptr %p) nounwind {
; AVX1-LABEL: interleaved_store_vf32_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, 96(%rdi)
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vmovaps %ymm1, 96(%rdi)
; AVX1-NEXT: vmovaps %ymm3, 64(%rdi)
-; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
+; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -529,7 +529,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm8
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -548,21 +548,21 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm6
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm0
+; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8
; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm4
+; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX512-NEXT: vpcmpeqb %zmm5, %zmm8, %k0
-; AVX512-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpcmpeqb %zmm5, %zmm0, %k0
+; AVX512-NEXT: vpcmpeqb %zmm1, %zmm6, %k1
; AVX512-NEXT: kxnorw %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -585,84 +585,82 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vmovdqa (%rdi), %xmm10
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm11
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm13
-; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm4
-; AVX1-NEXT: vpshufb %xmm6, %xmm12, %xmm5
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm5
-; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm14
-; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm7
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm5
; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm2
+; AVX1-NEXT: vpshufb %xmm8, %xmm6, %xmm11
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm7
-; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX1-NEXT: vpshufb %xmm1, %xmm12, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm10, %xmm4
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpcmpeqb %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpshufb %xmm1, %xmm14, %xmm0
-; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpcmpeqb %xmm0, %xmm9, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm10, %xmm4
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm3
-; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm8
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2,3],xmm10[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8
+; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
+; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4,5,6,7]
+; AVX1-NEXT: vpcmpeqb %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm9
+; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm11
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; AVX1-NEXT: vpshufb %xmm12, %xmm6, %xmm11
+; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
+; AVX1-NEXT: vpcmpeqb %xmm9, %xmm10, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11
+; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
+; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm13
+; AVX1-NEXT: vpshufb %xmm10, %xmm5, %xmm10
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
+; AVX1-NEXT: vpshufb %xmm12, %xmm6, %xmm13
+; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpcmpeqb %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm11, %xmm0
+; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm1
+; AVX1-NEXT: vpshufb %xmm12, %xmm5, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm2
-; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm15, %xmm1
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm10, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -670,8 +668,8 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
;
; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm10
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
@@ -681,69 +679,69 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm0
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm7, %ymm10, %ymm9
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184]
; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-NEXT: vpshufb %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm0
-; AVX2-NEXT: vpshufb %xmm7, %xmm4, %xmm7
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm1
-; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm1, %ymm10, %ymm7
-; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-NEXT: vpshufb %ymm1, %ymm8, %ymm1
-; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpcmpeqb %ymm0, %ymm9, %ymm9
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm7
-; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm1, %ymm10, %ymm7
-; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-NEXT: vpshufb %ymm1, %ymm8, %ymm1
-; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm1, %xmm4, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm8
+; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
+; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
+; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
+; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
+; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
+; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
+; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
+; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
+; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
+; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
+; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %ymm2, %ymm10, %ymm3
-; AVX2-NEXT: vpermd %ymm3, %ymm6, %ymm3
-; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm2
-; AVX2-NEXT: vpermd %ymm2, %ymm6, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm9, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm7, %ymm0
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -1008,36 +1006,36 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x
; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqu %xmm3, 80(%rdi)
-; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi)
-; AVX1-NEXT: vmovdqu %xmm5, 48(%rdi)
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vmovdqu %xmm5, 80(%rdi)
+; AVX1-NEXT: vmovdqu %xmm3, 64(%rdi)
+; AVX1-NEXT: vmovdqu %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
-; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
+; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1097,118 +1095,109 @@ ret void
define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind {
; AVX1-LABEL: interleaved_store_vf64_i8_stride3:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $88, %rsp
+; AVX1-NEXT: pushq %rax
; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovdqa %ymm3, %ymm11
-; AVX1-NEXT: vmovdqa %ymm2, %ymm12
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm13
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10>
-; AVX1-NEXT: vpshufb %xmm5, %xmm13, %xmm8
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128>
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm15
-; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm6
-; AVX1-NEXT: vpor %xmm6, %xmm8, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm9
-; AVX1-NEXT: vpshufb %xmm2, %xmm11, %xmm6
-; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm5, %xmm10, %xmm14
-; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm6
-; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm7
-; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm10
-; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm3
-; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm13
-; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm4
-; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm7
-; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15>
-; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm2
+; AVX1-NEXT: vmovdqa %ymm4, %ymm5
+; AVX1-NEXT: vmovdqa %ymm2, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10>
+; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128>
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm12
+; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7
+; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm2
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vmovdqa %ymm1, %ymm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX1-NEXT: vpshufb %xmm0, %xmm8, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15]
-; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15]
-; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0
-; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4]
-; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4]
-; AVX1-NEXT: vpor %xmm5, %xmm10, %xmm5
-; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
-; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm7
+; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm13
+; AVX1-NEXT: vpor %xmm7, %xmm13, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm10, %xmm8, %xmm13
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm14
+; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm15
+; AVX1-NEXT: vpor %xmm13, %xmm15, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm15, %xmm8, %xmm8
+; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm15, %xmm9, %xmm9
+; AVX1-NEXT: vpshufb %xmm15, %xmm0, %xmm15
+; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm10
+; AVX1-NEXT: vpor %xmm0, %xmm10, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm9 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm9 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
-; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
-; AVX1-NEXT: # xmm3 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128]
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm11
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm10
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm7
-; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm6
-; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm12
-; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX1-NEXT: vpshufb %xmm1, %xmm14, %xmm4
-; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm1, %xmm15, %xmm7
-; AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm2
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6
-; AVX1-NEXT: vpshufb %xmm1, %xmm13, %xmm8
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vmovdqu %xmm2, 80(%rdi)
-; AVX1-NEXT: vmovdqu %xmm10, 64(%rdi)
-; AVX1-NEXT: vmovdqu %xmm11, 16(%rdi)
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15>
+; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15]
+; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm10
+; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15]
+; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm6
+; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm11
+; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4]
+; AVX1-NEXT: vpor %xmm2, %xmm12, %xmm12
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm12
+; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
+; AVX1-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
+; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm8 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128]
+; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm10, %xmm5
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10
+; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm10, %xmm0
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10
+; AVX1-NEXT: vpshufb %xmm13, %xmm7, %xmm14
+; AVX1-NEXT: vpor %xmm14, %xmm10, %xmm10
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm9
+; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX1-NEXT: vpshufb %xmm9, %xmm12, %xmm12
+; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm7
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm2
+; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX1-NEXT: vmovdqu %xmm7, 80(%rdi)
+; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi)
+; AVX1-NEXT: vmovdqu %xmm5, 16(%rdi)
; AVX1-NEXT: vmovdqu %xmm4, (%rdi)
-; AVX1-NEXT: vmovdqu %xmm7, 48(%rdi)
-; AVX1-NEXT: vmovdqu %xmm3, 32(%rdi)
-; AVX1-NEXT: vmovdqu %xmm1, 176(%rdi)
-; AVX1-NEXT: vmovdqu %xmm0, 160(%rdi)
-; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi)
-; AVX1-NEXT: vmovdqu %xmm6, 96(%rdi)
-; AVX1-NEXT: vmovdqu %xmm9, 144(%rdi)
-; AVX1-NEXT: vmovdqu %xmm8, 128(%rdi)
-; AVX1-NEXT: addq $88, %rsp
+; AVX1-NEXT: vmovdqu %xmm12, 48(%rdi)
+; AVX1-NEXT: vmovdqu %xmm8, 32(%rdi)
+; AVX1-NEXT: vmovdqu %xmm6, 176(%rdi)
+; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi)
+; AVX1-NEXT: vmovdqu %xmm10, 112(%rdi)
+; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi)
+; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi)
+; AVX1-NEXT: vmovdqu %xmm2, 128(%rdi)
+; AVX1-NEXT: popq %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1307,125 +1296,116 @@ ret void
define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $40, %rsp
-; AVX1-NEXT: .cfi_def_cfa_offset 48
-; AVX1-NEXT: vmovdqu (%rdi), %xmm9
-; AVX1-NEXT: vmovdqu 16(%rdi), %xmm11
+; AVX1-NEXT: vmovdqu (%rdi), %xmm8
+; AVX1-NEXT: vmovups 16(%rdi), %xmm0
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqu 48(%rdi), %xmm10
-; AVX1-NEXT: vmovdqu 64(%rdi), %xmm15
-; AVX1-NEXT: vmovdqu 80(%rdi), %xmm14
-; AVX1-NEXT: vmovdqu 96(%rdi), %xmm3
-; AVX1-NEXT: vmovdqu 112(%rdi), %xmm1
-; AVX1-NEXT: vmovdqu 144(%rdi), %xmm6
-; AVX1-NEXT: vmovdqu 160(%rdi), %xmm12
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm0
+; AVX1-NEXT: vmovdqu 64(%rdi), %xmm3
+; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4
+; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5
+; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2
+; AVX1-NEXT: vmovdqu 144(%rdi), %xmm12
+; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
+; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm6
+; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7
+; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm9
+; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm11
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm15
+; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpor %xmm5, %xmm15, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm13
-; AVX1-NEXT: vpshufb %xmm2, %xmm9, %xmm5
-; AVX1-NEXT: vpshufb %xmm2, %xmm10, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm7
-; AVX1-NEXT: vmovdqa %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm1
+; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm12
+; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15
+; AVX1-NEXT: vmovdqa %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpshufb %xmm8, %xmm12, %xmm7
-; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm1
+; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm7
-; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm3
-; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm1
-; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm8, %xmm15, %xmm7
-; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm7
-; AVX1-NEXT: vpshufb %xmm8, %xmm14, %xmm6
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm15
-; AVX1-NEXT: vmovdqu 32(%rdi), %xmm7
-; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm9
-; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm10
-; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm10
-; AVX1-NEXT: vmovdqu 176(%rdi), %xmm9
-; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm1
-; AVX1-NEXT: vpshufb %xmm8, %xmm9, %xmm11
-; AVX1-NEXT: vpor %xmm1, %xmm11, %xmm11
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqu 128(%rdi), %xmm1
-; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm3
-; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm13, %xmm8, %xmm8
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15
+; AVX1-NEXT: vpor %xmm8, %xmm15, %xmm5
+; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm8
+; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm10
+; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm10
+; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm8
+; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vmovdqu 32(%rdi), %xmm8
+; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm14, %xmm8, %xmm12
+; AVX1-NEXT: vpor %xmm3, %xmm12, %xmm3
+; AVX1-NEXT: vmovdqu 176(%rdi), %xmm12
+; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb %xmm14, %xmm12, %xmm15
+; AVX1-NEXT: vpor %xmm1, %xmm15, %xmm1
+; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm13
+; AVX1-NEXT: vmovdqu 128(%rdi), %xmm15
+; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm14
+; AVX1-NEXT: vpor %xmm13, %xmm14, %xmm14
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm6
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm6
-; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm6
-; AVX1-NEXT: vpor %xmm6, %xmm13, %xmm12
-; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm11
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm10
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
-; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm4
-; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm4
-; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm6, %xmm9, %xmm5
-; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddb %xmm4, %xmm13, %xmm4
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm3, %xmm10, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm11, %xmm5
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm13
+; AVX1-NEXT: vpor %xmm6, %xmm13, %xmm13
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm14
+; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm14
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm0, %xmm8, %xmm7
+; AVX1-NEXT: vpor %xmm7, %xmm9, %xmm7
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
+; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4
+; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5
+; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm8
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm12, %xmm5
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5
+; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm8
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpaddb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5
+; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm8
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm3
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm3
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm3
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: addq $40, %rsp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
@@ -1538,54 +1518,54 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm11
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm13
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm13
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX1-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm14, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
; AVX1-NEXT: vmovaps %ymm3, 224(%rdi)
; AVX1-NEXT: vmovaps %ymm0, 192(%rdi)
-; AVX1-NEXT: vmovaps %ymm7, 160(%rdi)
+; AVX1-NEXT: vmovaps %ymm1, 160(%rdi)
; AVX1-NEXT: vmovaps %ymm6, 128(%rdi)
-; AVX1-NEXT: vmovaps %ymm1, 96(%rdi)
-; AVX1-NEXT: vmovaps %ymm2, 64(%rdi)
+; AVX1-NEXT: vmovaps %ymm2, 96(%rdi)
+; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
; AVX1-NEXT: vmovaps %ymm4, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm8, (%rdi)
; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/zext-sext.ll b/llvm/test/CodeGen/X86/zext-sext.ll
index 5140587ef2775..25929ecbde76f 100644
--- a/llvm/test/CodeGen/X86/zext-sext.ll
+++ b/llvm/test/CodeGen/X86/zext-sext.ll
@@ -19,15 +19,15 @@ define void @func(ptr %a, ptr %b, ptr %c, ptr %d) nounwind {
; CHECK-NEXT: imull %edx, %eax
; CHECK-NEXT: addl $2138875574, %eax # imm = 0x7F7CA6B6
; CHECK-NEXT: cmpl $2138875574, %eax # imm = 0x7F7CA6B6
-; CHECK-NEXT: setl %sil
+; CHECK-NEXT: setl %dl
; CHECK-NEXT: cmpl $-8608074, %eax # imm = 0xFF7CA6B6
-; CHECK-NEXT: setge %dl
-; CHECK-NEXT: andb %sil, %dl
+; CHECK-NEXT: setge %sil
+; CHECK-NEXT: andb %dl, %sil
+; CHECK-NEXT: movzbl %sil, %edx
; CHECK-NEXT: movslq %eax, %rsi
-; CHECK-NEXT: movzbl %dl, %edx
; CHECK-NEXT: movq %rsi, %rdi
-; CHECK-NEXT: subq %rax, %rdi
; CHECK-NEXT: negl %edx
+; CHECK-NEXT: subq %rax, %rdi
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testl $-2, %edx
; CHECK-NEXT: cmovneq %rax, %rdi
diff --git a/llvm/test/CodeGen/X86/znver3-gather.ll b/llvm/test/CodeGen/X86/znver3-gather.ll
index b908cdef36255..5a2721ca1f2f4 100644
--- a/llvm/test/CodeGen/X86/znver3-gather.ll
+++ b/llvm/test/CodeGen/X86/znver3-gather.ll
@@ -11,26 +11,26 @@ define <8 x i32> @simple(ptr %base, <8 x i32> %offsets) {
; X64-NEXT: vpmovsxdq %xmm2, %ymm2
; X64-NEXT: vpsllq $2, %ymm0, %ymm0
; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; X64-NEXT: vmovq %xmm0, %r8
-; X64-NEXT: vpextrq $1, %xmm0, %r9
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vpextrq $1, %xmm0, %rcx
; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
; X64-NEXT: vpsllq $2, %ymm2, %ymm2
; X64-NEXT: vpaddq %ymm2, %ymm1, %ymm2
; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: vpextrq $1, %xmm0, %r10
+; X64-NEXT: vpextrq $1, %xmm0, %rdx
; X64-NEXT: vmovq %xmm0, %rsi
; X64-NEXT: vextracti128 $1, %ymm2, %xmm0
; X64-NEXT: vmovq %xmm2, %rdi
-; X64-NEXT: vpextrq $1, %xmm2, %rax
-; X64-NEXT: vpinsrd $1, (%r9), %xmm1, %xmm1
-; X64-NEXT: vmovq %xmm0, %rcx
-; X64-NEXT: vpextrq $1, %xmm0, %rdx
+; X64-NEXT: vpextrq $1, %xmm2, %r8
+; X64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
+; X64-NEXT: vmovq %xmm0, %r9
+; X64-NEXT: vpextrq $1, %xmm0, %r10
; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vpinsrd $2, (%rsi), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%r10), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%rdx), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $1, (%r8), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%rdx), %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, (%r9), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%r10), %xmm0, %xmm0
; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %offsets
@@ -48,26 +48,26 @@ define <8 x i32> @optsize(ptr %base, <8 x i32> %offsets) optsize {
; X64-NEXT: vpmovsxdq %xmm2, %ymm2
; X64-NEXT: vpsllq $2, %ymm0, %ymm0
; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; X64-NEXT: vmovq %xmm0, %r8
-; X64-NEXT: vpextrq $1, %xmm0, %r9
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vpextrq $1, %xmm0, %rcx
; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
; X64-NEXT: vpsllq $2, %ymm2, %ymm2
; X64-NEXT: vpaddq %ymm2, %ymm1, %ymm2
; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: vpextrq $1, %xmm0, %r10
+; X64-NEXT: vpextrq $1, %xmm0, %rdx
; X64-NEXT: vmovq %xmm0, %rsi
; X64-NEXT: vextracti128 $1, %ymm2, %xmm0
; X64-NEXT: vmovq %xmm2, %rdi
-; X64-NEXT: vpextrq $1, %xmm2, %rax
-; X64-NEXT: vpinsrd $1, (%r9), %xmm1, %xmm1
-; X64-NEXT: vmovq %xmm0, %rcx
-; X64-NEXT: vpextrq $1, %xmm0, %rdx
+; X64-NEXT: vpextrq $1, %xmm2, %r8
+; X64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
+; X64-NEXT: vmovq %xmm0, %r9
+; X64-NEXT: vpextrq $1, %xmm0, %r10
; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vpinsrd $2, (%rsi), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%r10), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%rdx), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $1, (%r8), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%rdx), %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, (%r9), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%r10), %xmm0, %xmm0
; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %offsets
@@ -85,26 +85,26 @@ define <8 x i32> @minsize(ptr %base, <8 x i32> %offsets) minsize {
; X64-NEXT: vpmovsxdq %xmm2, %ymm2
; X64-NEXT: vpsllq $2, %ymm0, %ymm0
; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; X64-NEXT: vmovq %xmm0, %r8
-; X64-NEXT: vpextrq $1, %xmm0, %r9
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vpextrq $1, %xmm0, %rcx
; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
; X64-NEXT: vpsllq $2, %ymm2, %ymm2
; X64-NEXT: vpaddq %ymm2, %ymm1, %ymm2
; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: vpextrq $1, %xmm0, %r10
+; X64-NEXT: vpextrq $1, %xmm0, %rdx
; X64-NEXT: vmovq %xmm0, %rsi
; X64-NEXT: vextracti128 $1, %ymm2, %xmm0
; X64-NEXT: vmovq %xmm2, %rdi
-; X64-NEXT: vpextrq $1, %xmm2, %rax
-; X64-NEXT: vpinsrd $1, (%r9), %xmm1, %xmm1
-; X64-NEXT: vmovq %xmm0, %rcx
-; X64-NEXT: vpextrq $1, %xmm0, %rdx
+; X64-NEXT: vpextrq $1, %xmm2, %r8
+; X64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
+; X64-NEXT: vmovq %xmm0, %r9
+; X64-NEXT: vpextrq $1, %xmm0, %r10
; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vpinsrd $2, (%rsi), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%r10), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%rdx), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $1, (%r8), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%rdx), %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, (%r9), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%r10), %xmm0, %xmm0
; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %offsets
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding.mir b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding.mir
index 2c01180092bd8..ee004e0354fd4 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding.mir
@@ -1,7 +1,5 @@
# RUN: llc %s -o - -experimental-debug-variable-locations \
-# RUN: -start-before=x86-flags-copy-lowering -stop-after=virtregrewriter \
-# RUN: -mtriple x86_64-unknown-unknown \
-# RUN: | FileCheck %s
+# RUN: -run-pass=greedy,virtregrewriter | FileCheck %s
#
# This test is for stack spill folding -- the SETCC near the start of the MIR
# below should be morphed into an SETCCm by the register allocator, making it
@@ -19,9 +17,9 @@
# CHECK-LABEL: bb.0:
# CHECK: SETCCm %stack.0, {{.*}} debug-instr-number 2
--- |
- target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64--"
- define internal fastcc void @beans(i32 %Kind) unnamed_addr align 2 !dbg !7 {
+ define void @beans() !dbg !7 {
ret void
}
@@ -49,212 +47,18 @@
...
---
-name: beans
-alignment: 16
-tracksRegLiveness: true
-registers:
- - { id: 0, class: gr8 }
- - { id: 1, class: gr8 }
- - { id: 2, class: gr8 }
- - { id: 3, class: gr32 }
- - { id: 4, class: gr64 }
- - { id: 5, class: gr8 }
- - { id: 6, class: gr32 }
- - { id: 7, class: gr32 }
- - { id: 8, class: gr32 }
- - { id: 9, class: gr32 }
- - { id: 10, class: gr32 }
- - { id: 11, class: gr64 }
- - { id: 12, class: gr64 }
- - { id: 13, class: gr32 }
- - { id: 14, class: gr8 }
- - { id: 15, class: gr8 }
- - { id: 16, class: gr32 }
- - { id: 17, class: gr64 }
- - { id: 18, class: gr8 }
- - { id: 19, class: gr8 }
- - { id: 20, class: gr32 }
- - { id: 21, class: gr64 }
- - { id: 22, class: gr32 }
- - { id: 23, class: gr32 }
- - { id: 24, class: gr64 }
- - { id: 25, class: gr32 }
- - { id: 26, class: gr64 }
- - { id: 27, class: gr8 }
- - { id: 28, class: gr64_nosp }
- - { id: 29, class: gr32 }
- - { id: 30, class: gr8 }
- - { id: 31, class: gr8 }
- - { id: 32, class: gr64 }
- - { id: 33, class: gr8 }
- - { id: 34, class: gr64 }
- - { id: 35, class: gr64 }
- - { id: 36, class: gr8 }
- - { id: 37, class: gr64 }
- - { id: 38, class: gr8 }
- - { id: 39, class: gr32 }
- - { id: 40, class: gr32 }
- - { id: 41, class: gr8 }
- - { id: 42, class: gr8 }
- - { id: 43, class: gr8 }
-liveins:
- - { reg: '$edi', virtual-reg: '%7' }
-frameInfo:
- maxAlignment: 1
- hasCalls: true
-machineFunctionInfo: {}
-jumpTable:
- kind: block-address
- entries:
- - id: 0
- blocks: [ '%bb.11', '%bb.7', '%bb.11', '%bb.7', '%bb.12',
- '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7',
- '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.13',
- '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7',
- '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7',
- '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7',
- '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7', '%bb.7',
- '%bb.8' ]
-body: |
+name: beans
+tracksRegLiveness: True
+body: |
bb.0:
- successors: %bb.2(0x20000000), %bb.14(0x60000000)
liveins: $edi
-
%7:gr32 = COPY $edi
CMP32ri8 %7, 4, implicit-def $eflags
%0:gr8 = SETCCr 4, implicit $eflags, debug-instr-number 1
- CMP32ri8 %7, 2, implicit-def $eflags
- %1:gr8 = SETCCr 4, implicit $eflags
- CMP32ri8 %7, 1, implicit-def $eflags
- %2:gr8 = SETCCr 4, implicit $eflags
- %11:gr64 = IMPLICIT_DEF
- %12:gr64 = IMPLICIT_DEF
- %5:gr8 = MOV8rm %12, 1, $noreg, 0, $noreg, debug-location !13 :: (load (s8) from `i8* undef`, align 8)
- %13:gr32 = MOV32r0 implicit-def dead $eflags
- %14:gr8 = COPY %13.sub_8bit
- TEST8rr %14, %14, implicit-def $eflags, debug-location !13
- JCC_1 %bb.2, 5, implicit $eflags, debug-location !13
- JMP_1 %bb.14, debug-location !13
-
- bb.14:
- successors: %bb.2(0x2aaaaaab), %bb.1(0x55555555)
-
- CMP8ri %5, 70, implicit-def $eflags, debug-location !13
- JCC_1 %bb.2, 5, implicit $eflags, debug-location !13
- JMP_1 %bb.1, debug-location !13
-
- bb.1:
-
- bb.2:
- successors: %bb.4(0x20000000), %bb.15(0x60000000)
-
- %4:gr64 = MOV64rm %11, 1, $noreg, 0, $noreg, debug-location !13 :: (load (s64) from `%"class.llvm::Instruction"** undef`)
- ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !13
- %16:gr32 = IMPLICIT_DEF
- $edi = COPY %16, debug-location !13
- %17:gr64 = IMPLICIT_DEF
- CALL64r killed %17, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $al, debug-location !13
- ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !13
- %18:gr8 = COPY $al, debug-location !13
- TEST8ri %18, 1, implicit-def $eflags, debug-location !13
- JCC_1 %bb.4, 5, implicit $eflags, debug-location !13
- JMP_1 %bb.15, debug-location !13
-
- bb.15:
- successors: %bb.4(0x2aaaaaab), %bb.3(0x55555555)
-
- CMP8ri %5, 70, implicit-def $eflags, debug-location !13
- JCC_1 %bb.4, 4, implicit $eflags, debug-location !13
- JMP_1 %bb.3, debug-location !13
-
- bb.3:
-
- bb.4:
- successors: %bb.5, %bb.6
-
- %21:gr64 = IMPLICIT_DEF
- %20:gr32 = MOVZX32rm8 killed %21, 1, $noreg, 0, $noreg, debug-location !13 :: (load (s8) from `i32* undef`, align 8)
- %6:gr32 = nsw DEC32r %20, implicit-def dead $eflags, debug-location !13
- CMP32ri8 %6, 5, implicit-def $eflags, debug-location !13
- JCC_1 %bb.6, 7, implicit $eflags, debug-location !13
- JMP_1 %bb.5, debug-location !13
-
- bb.5:
- %24:gr64 = IMPLICIT_DEF
- %23:gr32 = MOVZX32rm8 %24, 1, $noreg, 0, $noreg, debug-location !13 :: (load (s8) from `i8* undef`, align 8)
- %25:gr32 = nsw ADD32ri8 %23, -22, implicit-def dead $eflags, debug-location !13
- ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !13
- $edi = COPY %25, debug-location !13
- %26:gr64 = IMPLICIT_DEF
- CALL64r %26, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $al, debug-location !13
- ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !13
-
- bb.6:
- successors: %bb.7(0x0aaaaaab), %bb.16(0x75555555)
-
- %31:gr8 = IMPLICIT_DEF
- CMP8ri %31, 40, implicit-def $eflags, debug-location !13
- JCC_1 %bb.7, 7, implicit $eflags, debug-location !13
-
- bb.16:
- successors: %bb.11(0x2e8ba2ea), %bb.7(0x0ba2e8ba), %bb.12(0x1745d174), %bb.13(0x1745d174), %bb.8(0x1745d174)
-
- %29:gr32 = MOV32r0 implicit-def dead $eflags
- %28:gr64_nosp = SUBREG_TO_REG 0, %29, %subreg.sub_32bit
- JMP64m $noreg, 8, %28, %jump-table.0, $noreg :: (load (s64) from jump-table)
-
- bb.7:
- %43:gr8 = IMPLICIT_DEF
- $al = COPY %43
- RET 0, $al
-
- bb.8:
- successors: %bb.10(0x20000000), %bb.17(0x60000000)
-
- CMP32ri8 %6, 5, implicit-def $eflags, debug-location !13
- JCC_1 %bb.10, 7, implicit $eflags, debug-location !13
- JMP_1 %bb.17, debug-location !13
-
- bb.17:
- successors: %bb.10(0x2aaaaaab), %bb.9(0x55555555)
-
- %3:gr32 = ADD32ri8 %7, -7, implicit-def dead $eflags
- CMP32ri8 %3, 3, implicit-def $eflags, debug-location !13
- JCC_1 %bb.10, 2, implicit $eflags, debug-location !13
- JMP_1 %bb.9, debug-location !13
-
- bb.9:
- %41:gr8 = IMPLICIT_DEF
- $al = COPY %41
- RET 0, $al
-
- bb.10:
- %42:gr8 = IMPLICIT_DEF
- $al = COPY %42
- RET 0, $al
-
- bb.11:
- %37:gr64 = IMPLICIT_DEF
- MOV8mr killed %37, 1, $noreg, 0, $noreg, %2, debug-location !13 :: (store (s8) into `i8* undef`, align 8)
- %38:gr8 = IMPLICIT_DEF
- $al = COPY %38
- RET 0, $al
-
- bb.12:
- %34:gr64 = IMPLICIT_DEF
- MOV8mr %34, 1, $noreg, 0, $noreg, %1, debug-location !13 :: (store (s8) into `i8* undef`, align 8)
- %35:gr64 = IMPLICIT_DEF
- MOV64mr %35, 1, $noreg, 0, $noreg, %4, debug-location !13 :: (store (s64) into `%"class.llvm::Instruction"** undef`)
- %36:gr8 = IMPLICIT_DEF
- $al = COPY %36
- RET 0, $al
-
- bb.13:
- DBG_INSTR_REF 1, 0, !12, !DIExpression(), debug-location !13
- %32:gr64 = IMPLICIT_DEF
- MOV8mr killed %32, 1, $noreg, 0, $noreg, %0, debug-location !13 :: (store (s8) into `i8* undef`, align 8)
- %33:gr8 = IMPLICIT_DEF
- $al = COPY %33
- RET 0, $al
+ ; clobber all registers to force a spill/reload
+ INLINEASM &nop, 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $rax, 12 /* clobber */, implicit-def dead early-clobber $rbx, 12 /* clobber */, implicit-def dead early-clobber $rcx, 12 /* clobber */, implicit-def dead early-clobber $rdx, 12 /* clobber */, implicit-def dead early-clobber $rsi, 12 /* clobber */, implicit-def dead early-clobber $rdi, 12 /* clobber */, implicit-def dead early-clobber $rbp, 12 /* clobber */, implicit-def dead early-clobber $r8, 12 /* clobber */, implicit-def dead early-clobber $r9, 12 /* clobber */, implicit-def dead early-clobber $r10, 12 /* clobber */, implicit-def dead early-clobber $r11, 12 /* clobber */, implicit-def dead early-clobber $r12, 12 /* clobber */, implicit-def dead early-clobber $r13, 12 /* clobber */, implicit-def dead early-clobber $r14, 12 /* clobber */, implicit-def dead early-clobber $r15
+
+ $al = COPY %0
+ RET 0, $al
...
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-coalescing.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-coalescing.mir
index 91f63937df1c9..9dd8a8ce2239d 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/phi-coalescing.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-coalescing.mir
@@ -139,14 +139,14 @@ body: |
 ; Verify that the vreg is different immediately after register coalescing.
; DOESCOALESCE-NOT: %10:gr64 ADD64ri32
; DOESCOALESCE: %{{[0-9]+}}:gr64 = ADD64ri32
- ; Verify that all these registers land in rbx.
- ; CHECK: renamable $rbx = ADD64ri32 killed renamable $rbx
+ ; Verify that all these registers land in r14.
+ ; CHECK: renamable $r14 = ADD64ri32 killed renamable $r14
; DOESCOALESCE-LABEL: bb.2.if.end:
; CHECK-LABEL: bb.2.if.end:
bb.2.if.end:
%2:gr64 = PHI %9, %bb.0, %10, %bb.1, debug-instr-number 1, debug-location !13
- ; CHECK: DBG_PHI $rbx, 1
+ ; CHECK: DBG_PHI $r14, 1
DBG_INSTR_REF 1, 0, !12, !DIExpression(), debug-location !13
%14:gr64 = ADD64rr killed %2, %6, implicit-def $eflags, debug-location !13
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp, debug-location !13
diff --git a/llvm/test/DebugInfo/X86/live-debug-variables.ll b/llvm/test/DebugInfo/X86/live-debug-variables.ll
index d058d17d953d8..43cdd8b3a256c 100644
--- a/llvm/test/DebugInfo/X86/live-debug-variables.ll
+++ b/llvm/test/DebugInfo/X86/live-debug-variables.ll
@@ -27,7 +27,7 @@
; CHECK: DW_TAG_formal_parameter
; Check concrete entry has a single location.
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_location (DW_OP_reg3 RBX)
+; CHECK-NEXT: DW_AT_location (DW_OP_reg6 RBP)
; CHECK-NEXT: DW_AT_abstract_origin
; CHECK-NOT: DW_TAG_formal_parameter
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
index b08c1eb862b6c..7944b52e1c0f1 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
@@ -23,7 +23,7 @@ define i32 @test(i8* %base) nounwind uwtable ssp {
; CHECK-NEXT: .cfi_offset %r13, -32
; CHECK-NEXT: .cfi_offset %r14, -24
; CHECK-NEXT: .cfi_offset %r15, -16
-; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: leaq 16(%rdi), %r15
; CHECK-NEXT: movl $16, %eax
; CHECK-NEXT: xorl %r12d, %r12d
@@ -31,23 +31,23 @@ define i32 @test(i8* %base) nounwind uwtable ssp {
; CHECK-NEXT: .LBB0_1: # %while.body.i
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB0_2 Depth 2
-; CHECK-NEXT: movslq %r12d, %r13
+; CHECK-NEXT: movslq %r12d, %r14
; CHECK-NEXT: movq %rax, %r12
-; CHECK-NEXT: leaq (%r15,%r13), %rbx
-; CHECK-NEXT: addq $16, %r13
+; CHECK-NEXT: leaq (%r15,%r14), %r13
+; CHECK-NEXT: addq $16, %r14
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %for.body.i
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
; CHECK-NEXT: callq check@PLT
-; CHECK-NEXT: incq %rbx
+; CHECK-NEXT: incq %r13
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.3: # %for.end.i
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: addq %r14, %r13
-; CHECK-NEXT: movq %r13, %rdi
-; CHECK-NEXT: movq %r13, %rsi
+; CHECK-NEXT: addq %rbx, %r14
+; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: movq %r14, %rsi
; CHECK-NEXT: callq foo@PLT
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je .LBB0_5
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index 8333750c0c6b6..02201126701ef 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -23,15 +23,15 @@ define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
; X64-NEXT: .LBB0_1: # %loop
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: addl (%rdi), %eax
-; X64-NEXT: leaq (%rdi,%rcx), %r8
+; X64-NEXT: leaq (%rdi,%rcx), %rdx
; X64-NEXT: addl (%rdi,%rcx), %eax
-; X64-NEXT: leaq (%r8,%rcx), %rdx
-; X64-NEXT: addl (%rcx,%r8), %eax
+; X64-NEXT: leaq (%rdx,%rcx), %r8
; X64-NEXT: addl (%rcx,%rdx), %eax
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: cmpq %rsi, %rdx
+; X64-NEXT: addl (%rcx,%r8), %eax
+; X64-NEXT: addq %rcx, %r8
+; X64-NEXT: addq %rcx, %r8
+; X64-NEXT: movq %r8, %rdi
+; X64-NEXT: cmpq %rsi, %r8
; X64-NEXT: jne .LBB0_1
; X64-NEXT: # %bb.2: # %exit
; X64-NEXT: retq
@@ -182,43 +182,39 @@ exit:
define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
; X64-LABEL: extrastride:
; X64: # %bb.0: # %entry
-; X64-NEXT: pushq %rbp
-; X64-NEXT: pushq %r14
; X64-NEXT: pushq %rbx
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: testl %r9d, %r9d
; X64-NEXT: je .LBB2_3
; X64-NEXT: # %bb.1: # %for.body.lr.ph
-; X64-NEXT: leal (%rsi,%rsi), %r14d
-; X64-NEXT: leal (%rsi,%rsi,2), %ebx
+; X64-NEXT: leal (%rsi,%rsi), %r10d
+; X64-NEXT: leal (%rsi,%rsi,2), %r11d
; X64-NEXT: addl %esi, %ecx
; X64-NEXT: leal (,%rsi,4), %eax
-; X64-NEXT: leal (%rcx,%rsi,4), %ebp
-; X64-NEXT: movslq %eax, %r10
-; X64-NEXT: movslq %ebx, %r11
-; X64-NEXT: movslq %r14d, %rbx
+; X64-NEXT: leal (%rcx,%rsi,4), %ebx
+; X64-NEXT: cltq
+; X64-NEXT: movslq %r11d, %rcx
+; X64-NEXT: movslq %r10d, %r10
; X64-NEXT: movslq %esi, %rsi
-; X64-NEXT: movslq %r8d, %rcx
-; X64-NEXT: shlq $2, %rcx
-; X64-NEXT: movslq %ebp, %rax
+; X64-NEXT: movslq %r8d, %r8
+; X64-NEXT: shlq $2, %r8
+; X64-NEXT: movslq %ebx, %r11
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB2_2: # %for.body
; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movl (%rdi,%rsi), %ebp
-; X64-NEXT: addl (%rdi), %ebp
-; X64-NEXT: addl (%rdi,%rbx), %ebp
-; X64-NEXT: addl (%rdi,%r11), %ebp
-; X64-NEXT: addl (%rdi,%r10), %ebp
-; X64-NEXT: movl %ebp, (%rdx)
-; X64-NEXT: addq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: movl (%rdi,%rsi), %ebx
+; X64-NEXT: addl (%rdi), %ebx
+; X64-NEXT: addl (%rdi,%r10), %ebx
+; X64-NEXT: addl (%rdi,%rcx), %ebx
+; X64-NEXT: addl (%rdi,%rax), %ebx
+; X64-NEXT: movl %ebx, (%rdx)
+; X64-NEXT: addq %r11, %rdi
+; X64-NEXT: addq %r8, %rdx
; X64-NEXT: decl %r9d
; X64-NEXT: jne .LBB2_2
; X64-NEXT: .LBB2_3: # %for.end
; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r14
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; X32-LABEL: extrastride:
@@ -320,22 +316,22 @@ define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nou
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_1: # %for.body
; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movzbl -3(%rdi,%rax), %r8d
-; X64-NEXT: movzbl -3(%rsi,%rax), %ecx
-; X64-NEXT: addl %r8d, %ecx
-; X64-NEXT: movb %cl, -3(%rdx,%rax)
-; X64-NEXT: movzbl -2(%rdi,%rax), %r8d
-; X64-NEXT: movzbl -2(%rsi,%rax), %ecx
-; X64-NEXT: addl %r8d, %ecx
-; X64-NEXT: movb %cl, -2(%rdx,%rax)
-; X64-NEXT: movzbl -1(%rdi,%rax), %r8d
-; X64-NEXT: movzbl -1(%rsi,%rax), %ecx
-; X64-NEXT: addl %r8d, %ecx
-; X64-NEXT: movb %cl, -1(%rdx,%rax)
-; X64-NEXT: movzbl (%rdi,%rax), %r8d
-; X64-NEXT: movzbl (%rsi,%rax), %ecx
-; X64-NEXT: addl %r8d, %ecx
-; X64-NEXT: movb %cl, (%rdx,%rax)
+; X64-NEXT: movzbl -3(%rdi,%rax), %ecx
+; X64-NEXT: movzbl -3(%rsi,%rax), %r8d
+; X64-NEXT: addl %ecx, %r8d
+; X64-NEXT: movb %r8b, -3(%rdx,%rax)
+; X64-NEXT: movzbl -2(%rdi,%rax), %ecx
+; X64-NEXT: movzbl -2(%rsi,%rax), %r8d
+; X64-NEXT: addl %ecx, %r8d
+; X64-NEXT: movb %r8b, -2(%rdx,%rax)
+; X64-NEXT: movzbl -1(%rdi,%rax), %ecx
+; X64-NEXT: movzbl -1(%rsi,%rax), %r8d
+; X64-NEXT: addl %ecx, %r8d
+; X64-NEXT: movb %r8b, -1(%rdx,%rax)
+; X64-NEXT: movzbl (%rdi,%rax), %ecx
+; X64-NEXT: movzbl (%rsi,%rax), %r8d
+; X64-NEXT: addl %ecx, %r8d
+; X64-NEXT: movb %r8b, (%rdx,%rax)
; X64-NEXT: addq $4, %rax
; X64-NEXT: cmpl $403, %eax # imm = 0x193
; X64-NEXT: jne .LBB3_1
@@ -513,8 +509,8 @@ define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len)
; X64: # %bb.0: # %entry
; X64-NEXT: movslq %edx, %rdx
; X64-NEXT: addq %rdx, %rdi
-; X64-NEXT: movslq %ecx, %r9
-; X64-NEXT: addq %rsi, %r9
+; X64-NEXT: movslq %ecx, %rax
+; X64-NEXT: addq %rsi, %rax
; X64-NEXT: addl %edx, %r8d
; X64-NEXT: movslq %r8d, %rcx
; X64-NEXT: subq %rdx, %rcx
@@ -522,8 +518,8 @@ define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len)
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB5_1: # %for.body82.us
; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movzbl (%r9,%rdx,4), %eax
-; X64-NEXT: movb %al, (%rdi,%rdx)
+; X64-NEXT: movzbl (%rax,%rdx,4), %esi
+; X64-NEXT: movb %sil, (%rdi,%rdx)
; X64-NEXT: incq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB5_1
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
index 240eb8c197444..b8f62b8c06a66 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
@@ -21,9 +21,9 @@
; LSR should prefer complicated address to additional add instructions.
; CHECK: LBB0_2:
-; CHECK-NEXT: movl (%r{{.+}},
-; CHECK-NEXT: addl (%r{{.+}},
-; CHECK-NEXT: movl %e{{.+}}, (%r{{.+}},
+; CHECK-NEXT: movl (%r{{.+}},{{.*}}), [[REG:%[a-z0-9]+]]
+; CHECK-NEXT: addl (%r{{.+}},{{.*}}), [[REG]]
+; CHECK-NEXT: movl [[REG]], (%{{.*}})
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/tools/llvm-locstats/locstats.ll b/llvm/test/tools/llvm-locstats/locstats.ll
index df1f24ee0618a..e980b227ff357 100644
--- a/llvm/test/tools/llvm-locstats/locstats.ll
+++ b/llvm/test/tools/llvm-locstats/locstats.ll
@@ -11,8 +11,8 @@
; LOCSTATS: [30%,40%) 0 0%
; LOCSTATS: [40%,50%) 0 0%
; LOCSTATS: [50%,60%) 0 0%
-; LOCSTATS: [60%,70%) 1 11%
-; LOCSTATS: [70%,80%) 0 0%
+; LOCSTATS: [60%,70%) 0 0%
+; LOCSTATS: [70%,80%) 1 11%
; LOCSTATS: [80%,90%) 2 22%
; LOCSTATS: [90%,100%) 1 11%
; LOCSTATS: 100% 4 44%